// Copyright 2019, VIXL authors
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of ARM Limited nor the names of its contributors may be
// used to endorse or promote products derived from this software without
// specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <cfloat>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <functional>
#include <sys/mman.h>
#include <unistd.h>
#include "test-runner.h"
#include "test-utils.h"
#include "aarch64/cpu-aarch64.h"
#include "aarch64/disasm-aarch64.h"
#include "aarch64/macro-assembler-aarch64.h"
#include "aarch64/simulator-aarch64.h"
#include "aarch64/test-utils-aarch64.h"
#include "test-assembler-aarch64.h"
#define TEST_SVE(name) TEST_SVE_INNER("ASM", name)
namespace vixl {
namespace aarch64 {
// Conveniently initialise P registers with scalar bit patterns. The destination
// lane size is ignored. This is optimised for call-site clarity, not generated
// code quality.
//
// Usage:
//
// Initialise(&masm, p0, 0x1234); // Sets p0 = 0b'0001'0010'0011'0100
void Initialise(MacroAssembler* masm,
const PRegister& pd,
uint64_t value3,
uint64_t value2,
uint64_t value1,
uint64_t value0) {
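// Four 64-bit chunks cover kPRegMaxSizeInBytes (32) bytes: the size of a P
// register at the maximum 2048-bit vector length.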
// Generate an in-line literal pool holding the predicate value, and load `pd`
// from it.
UseScratchRegisterScope temps(masm);
Register temp = temps.AcquireX();
Label data;
Label done;
masm->Adr(temp, &data);
masm->Ldr(pd, SVEMemOperand(temp));
masm->B(&done);
{
ExactAssemblyScope total(masm, kPRegMaxSizeInBytes);
masm->bind(&data);
masm->dc64(value0);
masm->dc64(value1);
masm->dc64(value2);
masm->dc64(value3);
}
masm->Bind(&done);
}
void Initialise(MacroAssembler* masm,
const PRegister& pd,
uint64_t value2,
uint64_t value1,
uint64_t value0) {
Initialise(masm, pd, 0, value2, value1, value0);
}
void Initialise(MacroAssembler* masm,
const PRegister& pd,
uint64_t value1,
uint64_t value0) {
Initialise(masm, pd, 0, 0, value1, value0);
}
void Initialise(MacroAssembler* masm, const PRegister& pd, uint64_t value0) {
Initialise(masm, pd, 0, 0, 0, value0);
}
// Conveniently initialise P registers by lane. This is optimised for call-site
// clarity, not generated code quality.
//
// Usage:
//
// int values[] = { 0x0, 0x1, 0x2 };
// Initialise(&masm, p0.VnS(), values); // Sets p0 = 0b'0000'0001'0010
//
// The rightmost (highest-indexed) array element maps to the lowest-numbered
// lane. Unspecified lanes are set to 0 (inactive).
//
// Each element of the `values` array is mapped onto a lane in `pd`. The
// architecture only respects the lowest bit, and writes zero to the upper
// bits, but other (encodable) values can be specified if required by the test.
template <typename T, size_t N>
void Initialise(MacroAssembler* masm,
const PRegisterWithLaneSize& pd,
const T (&values)[N]) {
// Turn the array into 64-bit chunks.
uint64_t chunks[4] = {0, 0, 0, 0};
VIXL_STATIC_ASSERT(sizeof(chunks) == kPRegMaxSizeInBytes);
int p_bits_per_lane = pd.GetLaneSizeInBits() / kZRegBitsPerPRegBit;
VIXL_ASSERT((64 % p_bits_per_lane) == 0);
VIXL_ASSERT((N * p_bits_per_lane) <= kPRegMaxSize);
uint64_t p_lane_mask = GetUintMask(p_bits_per_lane);
VIXL_STATIC_ASSERT(N <= kPRegMaxSize);
size_t bit = 0;
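// Pack the elements starting from the highest array index, so that
// values[N - 1] ends up in lane 0.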
for (int n = static_cast<int>(N - 1); n >= 0; n--) {
VIXL_ASSERT(bit < (sizeof(chunks) * kBitsPerByte));
uint64_t value = values[n] & p_lane_mask;
chunks[bit / 64] |= value << (bit % 64);
bit += p_bits_per_lane;
}
Initialise(masm, pd, chunks[3], chunks[2], chunks[1], chunks[0]);
}
// Ensure that basic test infrastructure works.
TEST_SVE(sve_test_infrastructure_z) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
__ Mov(x0, 0x0123456789abcdef);
// Test basic `Insr` behaviour.
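// `Insr` shifts the vector up by one lane, then writes the new value to
// lane 0.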
__ Insr(z0.VnB(), 1);
__ Insr(z0.VnB(), 2);
__ Insr(z0.VnB(), x0);
__ Insr(z0.VnB(), -42);
__ Insr(z0.VnB(), 0);
// Test array inputs.
int z1_inputs[] = {3, 4, 5, -42, 0};
InsrHelper(&masm, z1.VnH(), z1_inputs);
// Test that sign-extension works as intended for various lane sizes.
__ Dup(z2.VnD(), 0); // Clear the register first.
__ Insr(z2.VnB(), -42); // 0xd6
__ Insr(z2.VnB(), 0xfe); // 0xfe
__ Insr(z2.VnH(), -42); // 0xffd6
__ Insr(z2.VnH(), 0xfedc); // 0xfedc
__ Insr(z2.VnS(), -42); // 0xffffffd6
__ Insr(z2.VnS(), 0xfedcba98); // 0xfedcba98
// Use another register for VnD(), so we can support 128-bit Z registers.
__ Insr(z3.VnD(), -42); // 0xffffffffffffffd6
__ Insr(z3.VnD(), 0xfedcba9876543210); // 0xfedcba9876543210
END();
if (CAN_RUN()) {
RUN();
// Test that array checks work properly on a register initialised
// lane-by-lane.
int z0_inputs_b[] = {0x01, 0x02, 0xef, 0xd6, 0x00};
ASSERT_EQUAL_SVE(z0_inputs_b, z0.VnB());
// Test that lane-by-lane checks work properly on a register initialised
// by array.
for (size_t i = 0; i < ArrayLength(z1_inputs); i++) {
// The rightmost (highest-indexed) array element maps to the
// lowest-numbered lane.
int lane = static_cast<int>(ArrayLength(z1_inputs) - i - 1);
ASSERT_EQUAL_SVE_LANE(z1_inputs[i], z1.VnH(), lane);
}
uint64_t z2_inputs_d[] = {0x0000d6feffd6fedc, 0xffffffd6fedcba98};
ASSERT_EQUAL_SVE(z2_inputs_d, z2.VnD());
uint64_t z3_inputs_d[] = {0xffffffffffffffd6, 0xfedcba9876543210};
ASSERT_EQUAL_SVE(z3_inputs_d, z3.VnD());
}
}
// Ensure that basic test infrastructure works.
TEST_SVE(sve_test_infrastructure_p) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// Simple cases: move boolean (0 or 1) values.
int p0_inputs[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0};
Initialise(&masm, p0.VnB(), p0_inputs);
int p1_inputs[] = {1, 0, 1, 1, 0, 1, 1, 1};
Initialise(&masm, p1.VnH(), p1_inputs);
int p2_inputs[] = {1, 1, 0, 1};
Initialise(&masm, p2.VnS(), p2_inputs);
int p3_inputs[] = {0, 1};
Initialise(&masm, p3.VnD(), p3_inputs);
// Advanced cases: move numeric value into architecturally-ignored bits.
// B-sized lanes get one bit in a P register, so there are no ignored bits.
// H-sized lanes get two bits in a P register.
int p4_inputs[] = {0x3, 0x2, 0x1, 0x0, 0x1, 0x2, 0x3};
Initialise(&masm, p4.VnH(), p4_inputs);
// S-sized lanes get four bits in a P register.
int p5_inputs[] = {0xc, 0x7, 0x9, 0x6, 0xf};
Initialise(&masm, p5.VnS(), p5_inputs);
// D-sized lanes get eight bits in a P register.
int p6_inputs[] = {0x81, 0xcc, 0x55};
Initialise(&masm, p6.VnD(), p6_inputs);
// The largest possible P register has 32 bytes.
int p7_inputs[] = {0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f};
Initialise(&masm, p7.VnD(), p7_inputs);
END();
if (CAN_RUN()) {
RUN();
// Test that lane-by-lane checks work properly. The rightmost
// (highest-indexed) array element maps to the lowest-numbered lane.
for (size_t i = 0; i < ArrayLength(p0_inputs); i++) {
int lane = static_cast<int>(ArrayLength(p0_inputs) - i - 1);
ASSERT_EQUAL_SVE_LANE(p0_inputs[i], p0.VnB(), lane);
}
for (size_t i = 0; i < ArrayLength(p1_inputs); i++) {
int lane = static_cast<int>(ArrayLength(p1_inputs) - i - 1);
ASSERT_EQUAL_SVE_LANE(p1_inputs[i], p1.VnH(), lane);
}
for (size_t i = 0; i < ArrayLength(p2_inputs); i++) {
int lane = static_cast<int>(ArrayLength(p2_inputs) - i - 1);
ASSERT_EQUAL_SVE_LANE(p2_inputs[i], p2.VnS(), lane);
}
for (size_t i = 0; i < ArrayLength(p3_inputs); i++) {
int lane = static_cast<int>(ArrayLength(p3_inputs) - i - 1);
ASSERT_EQUAL_SVE_LANE(p3_inputs[i], p3.VnD(), lane);
}
// Test that array checks work properly on predicates initialised with a
// possibly-different lane size.
// 0b...11'10'01'00'01'10'11
int p4_expected[] = {0x39, 0x1b};
ASSERT_EQUAL_SVE(p4_expected, p4.VnD());
ASSERT_EQUAL_SVE(p5_inputs, p5.VnS());
// 0b...10000001'11001100'01010101
int p6_expected[] = {2, 0, 0, 1, 3, 0, 3, 0, 1, 1, 1, 1};
ASSERT_EQUAL_SVE(p6_expected, p6.VnH());
// 0b...10011100'10011101'10011110'10011111
int p7_expected[] = {1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1,
1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1};
ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
}
}
// Test that writes to V registers clear the high bits of the corresponding Z
// register.
TEST_SVE(sve_v_write_clear) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kNEON,
CPUFeatures::kFP,
CPUFeatures::kSVE);
START();
// The Simulator has two mechanisms for writing V registers:
// - Write*Register, calling through to SimRegisterBase::Write.
// - LogicVRegister::ClearForWrite followed by one or more lane updates.
// Try to cover both variants.
// Prepare some known inputs.
uint8_t data[kQRegSizeInBytes];
for (size_t i = 0; i < kQRegSizeInBytes; i++) {
data[i] = 42 + i;
}
__ Mov(x10, reinterpret_cast<uintptr_t>(data));
__ Fmov(d30, 42.0);
// Use Index to label the lane indices, so failures are easy to detect and
// diagnose.
__ Index(z0.VnB(), 0, 1);
__ Index(z1.VnB(), 0, 1);
__ Index(z2.VnB(), 0, 1);
__ Index(z3.VnB(), 0, 1);
__ Index(z4.VnB(), 0, 1);
__ Index(z10.VnB(), 0, -1);
__ Index(z11.VnB(), 0, -1);
__ Index(z12.VnB(), 0, -1);
__ Index(z13.VnB(), 0, -1);
__ Index(z14.VnB(), 0, -1);
// Instructions using Write*Register (and SimRegisterBase::Write).
__ Ldr(b0, MemOperand(x10));
__ Fcvt(h1, d30);
__ Fmov(s2, 1.5f);
__ Fmov(d3, d30);
__ Ldr(q4, MemOperand(x10));
// Instructions using LogicVRegister::ClearForWrite.
// These also (incidentally) test that across-lane instructions correctly
// ignore the high-order Z register lanes.
__ Sminv(b10, v10.V16B());
__ Addv(h11, v11.V4H());
__ Saddlv(s12, v12.V8H());
__ Dup(v13.V8B(), b13, kDRegSizeInBytes);
__ Uaddl(v14.V8H(), v14.V8B(), v14.V8B());
END();
if (CAN_RUN()) {
RUN();
// Check the Q part first.
ASSERT_EQUAL_128(0x0000000000000000, 0x000000000000002a, v0);
ASSERT_EQUAL_128(0x0000000000000000, 0x0000000000005140, v1); // 42.0 (f16)
ASSERT_EQUAL_128(0x0000000000000000, 0x000000003fc00000, v2); // 1.5 (f32)
ASSERT_EQUAL_128(0x0000000000000000, 0x4045000000000000, v3); // 42.0 (f64)
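// q4 holds the bytes loaded from `data`: 42 + i for byte i, i.e. 0x2a to 0x39.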
ASSERT_EQUAL_128(0x3938373635343332, 0x31302f2e2d2c2b2a, v4);
ASSERT_EQUAL_128(0x0000000000000000, 0x00000000000000f1, v10); // -15
// 0xf9fa + 0xfbfc + 0xfdfe + 0xff00 -> 0xf2f4
ASSERT_EQUAL_128(0x0000000000000000, 0x000000000000f2f4, v11);
// 0xfffff1f2 + 0xfffff3f4 + ... + 0xfffffdfe + 0xffffff00 -> 0xffffc6c8
ASSERT_EQUAL_128(0x0000000000000000, 0x00000000ffffc6c8, v12);
ASSERT_EQUAL_128(0x0000000000000000, 0xf8f8f8f8f8f8f8f8, v13); // [-8] x 8
// [0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, 0x0000]
// + [0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, 0x0000]
// -> [0x01f2, 0x01f4, 0x01f6, 0x01f8, 0x01fa, 0x01fc, 0x01fe, 0x0000]
ASSERT_EQUAL_128(0x01f201f401f601f8, 0x01fa01fc01fe0000, v14);
// Check that the upper lanes are all clear.
for (int i = kQRegSizeInBytes; i < core.GetSVELaneCount(kBRegSize); i++) {
ASSERT_EQUAL_SVE_LANE(0x00, z0.VnB(), i);
ASSERT_EQUAL_SVE_LANE(0x00, z1.VnB(), i);
ASSERT_EQUAL_SVE_LANE(0x00, z2.VnB(), i);
ASSERT_EQUAL_SVE_LANE(0x00, z3.VnB(), i);
ASSERT_EQUAL_SVE_LANE(0x00, z4.VnB(), i);
ASSERT_EQUAL_SVE_LANE(0x00, z10.VnB(), i);
ASSERT_EQUAL_SVE_LANE(0x00, z11.VnB(), i);
ASSERT_EQUAL_SVE_LANE(0x00, z12.VnB(), i);
ASSERT_EQUAL_SVE_LANE(0x00, z13.VnB(), i);
ASSERT_EQUAL_SVE_LANE(0x00, z14.VnB(), i);
}
}
}
static void MlaMlsHelper(Test* config, unsigned lane_size_in_bits) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int zd_inputs[] = {0xbb, 0xcc, 0xdd, 0xee};
int za_inputs[] = {-39, 1, -3, 2};
int zn_inputs[] = {-5, -20, 9, 8};
int zm_inputs[] = {9, -5, 4, 5};
ZRegister zd = z0.WithLaneSize(lane_size_in_bits);
ZRegister za = z1.WithLaneSize(lane_size_in_bits);
ZRegister zn = z2.WithLaneSize(lane_size_in_bits);
ZRegister zm = z3.WithLaneSize(lane_size_in_bits);
// TODO: Use a simple `Dup` once it accepts arbitrary immediates.
InsrHelper(&masm, zd, zd_inputs);
InsrHelper(&masm, za, za_inputs);
InsrHelper(&masm, zn, zn_inputs);
InsrHelper(&masm, zm, zm_inputs);
int p0_inputs[] = {1, 1, 0, 1};
int p1_inputs[] = {1, 0, 1, 1};
int p2_inputs[] = {0, 1, 1, 1};
int p3_inputs[] = {1, 1, 1, 0};
Initialise(&masm, p0.WithLaneSize(lane_size_in_bits), p0_inputs);
Initialise(&masm, p1.WithLaneSize(lane_size_in_bits), p1_inputs);
Initialise(&masm, p2.WithLaneSize(lane_size_in_bits), p2_inputs);
Initialise(&masm, p3.WithLaneSize(lane_size_in_bits), p3_inputs);
// The Mla macro automatically selects between mla, mad and movprfx + mla
// based on what registers are aliased.
ZRegister mla_da_result = z10.WithLaneSize(lane_size_in_bits);
ZRegister mla_dn_result = z11.WithLaneSize(lane_size_in_bits);
ZRegister mla_dm_result = z12.WithLaneSize(lane_size_in_bits);
ZRegister mla_d_result = z13.WithLaneSize(lane_size_in_bits);
__ Mov(mla_da_result, za);
__ Mla(mla_da_result, p0.Merging(), mla_da_result, zn, zm);
__ Mov(mla_dn_result, zn);
__ Mla(mla_dn_result, p1.Merging(), za, mla_dn_result, zm);
__ Mov(mla_dm_result, zm);
__ Mla(mla_dm_result, p2.Merging(), za, zn, mla_dm_result);
__ Mov(mla_d_result, zd);
__ Mla(mla_d_result, p3.Merging(), za, zn, zm);
// The Mls macro automatically selects between mls, msb and movprfx + mls
// based on what registers are aliased.
ZRegister mls_da_result = z20.WithLaneSize(lane_size_in_bits);
ZRegister mls_dn_result = z21.WithLaneSize(lane_size_in_bits);
ZRegister mls_dm_result = z22.WithLaneSize(lane_size_in_bits);
ZRegister mls_d_result = z23.WithLaneSize(lane_size_in_bits);
__ Mov(mls_da_result, za);
__ Mls(mls_da_result, p0.Merging(), mls_da_result, zn, zm);
__ Mov(mls_dn_result, zn);
__ Mls(mls_dn_result, p1.Merging(), za, mls_dn_result, zm);
__ Mov(mls_dm_result, zm);
__ Mls(mls_dm_result, p2.Merging(), za, zn, mls_dm_result);
__ Mov(mls_d_result, zd);
__ Mls(mls_d_result, p3.Merging(), za, zn, zm);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(za_inputs, z1.WithLaneSize(lane_size_in_bits));
ASSERT_EQUAL_SVE(zn_inputs, z2.WithLaneSize(lane_size_in_bits));
ASSERT_EQUAL_SVE(zm_inputs, z3.WithLaneSize(lane_size_in_bits));
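// Per lane: mla[i] = za[i] + (zn[i] * zm[i]); mls[i] = za[i] - (zn[i] * zm[i]).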
int mla[] = {-84, 101, 33, 42};
int mls[] = {6, -99, -39, -38};
int mla_da_expected[] = {mla[0], mla[1], za_inputs[2], mla[3]};
ASSERT_EQUAL_SVE(mla_da_expected, mla_da_result);
int mla_dn_expected[] = {mla[0], zn_inputs[1], mla[2], mla[3]};
ASSERT_EQUAL_SVE(mla_dn_expected, mla_dn_result);
int mla_dm_expected[] = {zm_inputs[0], mla[1], mla[2], mla[3]};
ASSERT_EQUAL_SVE(mla_dm_expected, mla_dm_result);
int mla_d_expected[] = {mla[0], mla[1], mla[2], zd_inputs[3]};
ASSERT_EQUAL_SVE(mla_d_expected, mla_d_result);
int mls_da_expected[] = {mls[0], mls[1], za_inputs[2], mls[3]};
ASSERT_EQUAL_SVE(mls_da_expected, mls_da_result);
int mls_dn_expected[] = {mls[0], zn_inputs[1], mls[2], mls[3]};
ASSERT_EQUAL_SVE(mls_dn_expected, mls_dn_result);
int mls_dm_expected[] = {zm_inputs[0], mls[1], mls[2], mls[3]};
ASSERT_EQUAL_SVE(mls_dm_expected, mls_dm_result);
int mls_d_expected[] = {mls[0], mls[1], mls[2], zd_inputs[3]};
ASSERT_EQUAL_SVE(mls_d_expected, mls_d_result);
}
}
TEST_SVE(sve_mla_mls_b) { MlaMlsHelper(config, kBRegSize); }
TEST_SVE(sve_mla_mls_h) { MlaMlsHelper(config, kHRegSize); }
TEST_SVE(sve_mla_mls_s) { MlaMlsHelper(config, kSRegSize); }
TEST_SVE(sve_mla_mls_d) { MlaMlsHelper(config, kDRegSize); }
TEST_SVE(sve_bitwise_unpredicate_logical) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t z8_inputs[] = {0xfedcba9876543210, 0x0123456789abcdef};
InsrHelper(&masm, z8.VnD(), z8_inputs);
uint64_t z15_inputs[] = {0xffffeeeeddddcccc, 0xccccddddeeeeffff};
InsrHelper(&masm, z15.VnD(), z15_inputs);
__ And(z1.VnD(), z8.VnD(), z15.VnD());
__ Bic(z2.VnD(), z8.VnD(), z15.VnD());
__ Eor(z3.VnD(), z8.VnD(), z15.VnD());
__ Orr(z4.VnD(), z8.VnD(), z15.VnD());
END();
if (CAN_RUN()) {
RUN();
uint64_t z1_expected[] = {0xfedcaa8854540000, 0x0000454588aacdef};
uint64_t z2_expected[] = {0x0000101022003210, 0x0123002201010000};
uint64_t z3_expected[] = {0x01235476ab89fedc, 0xcdef98ba67453210};
uint64_t z4_expected[] = {0xfffffefeffddfedc, 0xcdefddffefefffff};
ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
}
}
TEST_SVE(sve_last_r) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
START();
__ Pfalse(p1.VnB());
int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
Initialise(&masm, p2.VnB(), p2_inputs);
Initialise(&masm, p3.VnB(), p3_inputs);
__ Ptrue(p4.VnB());
__ Index(z0.VnB(), 0x10, 1);
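// `Lastb` extracts the last active lane; `Lasta` extracts the lane after it.
// With no active lanes, `Lastb` wraps around to the last lane of the vector
// and `Lasta` to lane 0.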
__ Lasta(x1, p1, z0.VnB());
__ Lastb(x2, p1, z0.VnB());
__ Lasta(x3, p2, z0.VnB());
__ Lastb(x4, p2, z0.VnB());
__ Lasta(x5, p3, z0.VnB());
__ Lastb(x6, p3, z0.VnB());
__ Lasta(x7, p4, z0.VnB());
__ Punpklo(p3.VnH(), p3.VnB());
__ Index(z0.VnH(), 0x1110, 1);
__ Lasta(x9, p1, z0.VnH());
__ Lastb(x10, p3, z0.VnH());
__ Lasta(x12, p4, z0.VnH());
__ Index(z0.VnS(), 0x11111110, 1);
__ Lastb(x13, p1, z0.VnS());
__ Lasta(x14, p2, z0.VnS());
__ Lastb(x18, p4, z0.VnS());
__ Index(z0.VnD(), 0x1111111111111110, 1);
__ Lasta(x19, p1, z0.VnD());
__ Lastb(x20, p3, z0.VnD());
__ Lasta(x21, p3, z0.VnD());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_64(0x0000000000000010, x1);
ASSERT_EQUAL_64(0x0000000000000011, x3);
ASSERT_EQUAL_64(0x0000000000000010, x4);
ASSERT_EQUAL_64(0x0000000000000019, x5);
ASSERT_EQUAL_64(0x0000000000000018, x6);
ASSERT_EQUAL_64(0x0000000000000010, x7);
ASSERT_EQUAL_64(0x0000000000001110, x9);
ASSERT_EQUAL_64(0x0000000000001110, x12);
ASSERT_EQUAL_64(0x0000000011111111, x14);
ASSERT_EQUAL_64(0x1111111111111110, x19);
int vl = core.GetSVELaneCount(kBRegSize) * 8;
switch (vl) {
case 128:
ASSERT_EQUAL_64(0x000000000000001f, x2);
ASSERT_EQUAL_64(0x0000000000001116, x10);
ASSERT_EQUAL_64(0x0000000011111113, x13);
ASSERT_EQUAL_64(0x0000000011111113, x18);
ASSERT_EQUAL_64(0x1111111111111111, x20);
ASSERT_EQUAL_64(0x1111111111111110, x21);
break;
case 512:
ASSERT_EQUAL_64(0x000000000000004f, x2);
ASSERT_EQUAL_64(0x0000000000001118, x10);
ASSERT_EQUAL_64(0x000000001111111f, x13);
ASSERT_EQUAL_64(0x000000001111111f, x18);
ASSERT_EQUAL_64(0x1111111111111112, x20);
ASSERT_EQUAL_64(0x1111111111111113, x21);
break;
case 2048:
ASSERT_EQUAL_64(0x000000000000000f, x2);
ASSERT_EQUAL_64(0x0000000000001118, x10);
ASSERT_EQUAL_64(0x000000001111114f, x13);
ASSERT_EQUAL_64(0x000000001111114f, x18);
ASSERT_EQUAL_64(0x1111111111111112, x20);
ASSERT_EQUAL_64(0x1111111111111113, x21);
break;
default:
printf("WARNING: Some tests skipped due to unexpected VL.\n");
break;
}
}
}
TEST_SVE(sve_last_v) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
START();
__ Pfalse(p1.VnB());
int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
Initialise(&masm, p2.VnB(), p2_inputs);
Initialise(&masm, p3.VnB(), p3_inputs);
__ Ptrue(p4.VnB());
__ Index(z0.VnB(), 0x10, 1);
__ Lasta(b1, p1, z0.VnB());
__ Lastb(b2, p1, z0.VnB());
__ Lasta(b3, p2, z0.VnB());
__ Lastb(b4, p2, z0.VnB());
__ Lasta(b5, p3, z0.VnB());
__ Lastb(b6, p3, z0.VnB());
__ Lasta(b7, p4, z0.VnB());
__ Punpklo(p3.VnH(), p3.VnB());
__ Index(z0.VnH(), 0x1110, 1);
__ Lasta(h9, p1, z0.VnH());
__ Lastb(h10, p3, z0.VnH());
__ Lasta(h12, p4, z0.VnH());
__ Index(z0.VnS(), 0x11111110, 1);
__ Lastb(s13, p1, z0.VnS());
__ Lasta(s14, p2, z0.VnS());
__ Lastb(s18, p4, z0.VnS());
__ Index(z0.VnD(), 0x1111111111111110, 1);
__ Lasta(d19, p1, z0.VnD());
__ Lastb(d20, p3, z0.VnD());
__ Lasta(d21, p3, z0.VnD());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_128(0, 0x0000000000000010, q1);
ASSERT_EQUAL_128(0, 0x0000000000000011, q3);
ASSERT_EQUAL_128(0, 0x0000000000000010, q4);
ASSERT_EQUAL_128(0, 0x0000000000000019, q5);
ASSERT_EQUAL_128(0, 0x0000000000000018, q6);
ASSERT_EQUAL_128(0, 0x0000000000000010, q7);
ASSERT_EQUAL_128(0, 0x0000000000001110, q9);
ASSERT_EQUAL_128(0, 0x0000000000001110, q12);
ASSERT_EQUAL_128(0, 0x0000000011111111, q14);
ASSERT_EQUAL_128(0, 0x1111111111111110, q19);
int vl = core.GetSVELaneCount(kBRegSize) * 8;
switch (vl) {
case 128:
ASSERT_EQUAL_128(0, 0x000000000000001f, q2);
ASSERT_EQUAL_128(0, 0x0000000000001116, q10);
ASSERT_EQUAL_128(0, 0x0000000011111113, q13);
ASSERT_EQUAL_128(0, 0x0000000011111113, q18);
ASSERT_EQUAL_128(0, 0x1111111111111111, q20);
ASSERT_EQUAL_128(0, 0x1111111111111110, q21);
break;
case 512:
ASSERT_EQUAL_128(0, 0x000000000000004f, q2);
ASSERT_EQUAL_128(0, 0x0000000000001118, q10);
ASSERT_EQUAL_128(0, 0x000000001111111f, q13);
ASSERT_EQUAL_128(0, 0x000000001111111f, q18);
ASSERT_EQUAL_128(0, 0x1111111111111112, q20);
ASSERT_EQUAL_128(0, 0x1111111111111113, q21);
break;
case 2048:
ASSERT_EQUAL_128(0, 0x000000000000000f, q2);
ASSERT_EQUAL_128(0, 0x0000000000001118, q10);
ASSERT_EQUAL_128(0, 0x000000001111114f, q13);
ASSERT_EQUAL_128(0, 0x000000001111114f, q18);
ASSERT_EQUAL_128(0, 0x1111111111111112, q20);
ASSERT_EQUAL_128(0, 0x1111111111111113, q21);
break;
default:
printf("WARNING: Some tests skipped due to unexpected VL.\n");
break;
}
}
}
TEST_SVE(sve_clast_r) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
START();
__ Pfalse(p1.VnB());
int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
Initialise(&masm, p2.VnB(), p2_inputs);
Initialise(&masm, p3.VnB(), p3_inputs);
__ Ptrue(p4.VnB());
__ Index(z0.VnB(), 0x10, 1);
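// `Clasta` and `Clastb` behave like `Lasta` and `Lastb`, except that with no
// active lanes they return the scalar operand, truncated to the lane size and
// zero-extended.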
__ Mov(x1, -1);
__ Mov(x2, -1);
__ Clasta(x1, p1, x1, z0.VnB());
__ Clastb(x2, p1, x2, z0.VnB());
__ Clasta(x3, p2, x3, z0.VnB());
__ Clastb(x4, p2, x4, z0.VnB());
__ Clasta(x5, p3, x5, z0.VnB());
__ Clastb(x6, p3, x6, z0.VnB());
__ Clasta(x7, p4, x7, z0.VnB());
__ Punpklo(p3.VnH(), p3.VnB());
__ Index(z0.VnH(), 0x1110, 1);
__ Mov(x9, -1);
__ Clasta(x9, p1, x9, z0.VnH());
__ Clastb(x10, p3, x10, z0.VnH());
__ Clasta(x12, p4, x12, z0.VnH());
__ Index(z0.VnS(), 0x11111110, 1);
__ Mov(x13, -1);
__ Clasta(x13, p1, x13, z0.VnS());
__ Clastb(x14, p2, x14, z0.VnS());
__ Clasta(x18, p4, x18, z0.VnS());
__ Index(z0.VnD(), 0x1111111111111110, 1);
__ Mov(x19, -1);
__ Clasta(x19, p1, x19, z0.VnD());
__ Clastb(x20, p2, x20, z0.VnD());
__ Clasta(x21, p4, x21, z0.VnD());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_64(0x00000000000000ff, x1);
ASSERT_EQUAL_64(0x00000000000000ff, x2);
ASSERT_EQUAL_64(0x0000000000000011, x3);
ASSERT_EQUAL_64(0x0000000000000010, x4);
ASSERT_EQUAL_64(0x0000000000000019, x5);
ASSERT_EQUAL_64(0x0000000000000018, x6);
ASSERT_EQUAL_64(0x0000000000000010, x7);
ASSERT_EQUAL_64(0x000000000000ffff, x9);
ASSERT_EQUAL_64(0x0000000000001110, x12);
ASSERT_EQUAL_64(0x00000000ffffffff, x13);
ASSERT_EQUAL_64(0x0000000011111110, x14);
ASSERT_EQUAL_64(0x0000000011111110, x18);
ASSERT_EQUAL_64(0xffffffffffffffff, x19);
ASSERT_EQUAL_64(0x1111111111111110, x20);
ASSERT_EQUAL_64(0x1111111111111110, x21);
int vl = core.GetSVELaneCount(kBRegSize) * 8;
switch (vl) {
case 128:
ASSERT_EQUAL_64(0x0000000000001116, x10);
break;
case 512:
case 2048:
ASSERT_EQUAL_64(0x0000000000001118, x10);
break;
default:
printf("WARNING: Some tests skipped due to unexpected VL.\n");
break;
}
}
}
TEST_SVE(sve_clast_v) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
START();
__ Pfalse(p1.VnB());
int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
Initialise(&masm, p2.VnB(), p2_inputs);
Initialise(&masm, p3.VnB(), p3_inputs);
__ Ptrue(p4.VnB());
__ Index(z0.VnB(), 0x10, 1);
__ Dup(z1.VnB(), -1);
__ Dup(z2.VnB(), -1);
__ Clasta(b1, p1, b1, z0.VnB());
__ Clastb(b2, p1, b2, z0.VnB());
__ Clasta(b3, p2, b3, z0.VnB());
__ Clastb(b4, p2, b4, z0.VnB());
__ Clasta(b5, p3, b5, z0.VnB());
__ Clastb(b6, p3, b6, z0.VnB());
__ Clasta(b7, p4, b7, z0.VnB());
__ Punpklo(p3.VnH(), p3.VnB());
__ Index(z0.VnH(), 0x1110, 1);
__ Dup(z9.VnB(), -1);
__ Clasta(h9, p1, h9, z0.VnH());
__ Clastb(h10, p3, h10, z0.VnH());
__ Clasta(h12, p4, h12, z0.VnH());
__ Index(z0.VnS(), 0x11111110, 1);
__ Dup(z13.VnB(), -1);
__ Clasta(s13, p1, s13, z0.VnS());
__ Clastb(s14, p2, s14, z0.VnS());
__ Clasta(s18, p4, s18, z0.VnS());
__ Index(z0.VnD(), 0x1111111111111110, 1);
__ Dup(z19.VnB(), -1);
__ Clasta(d19, p1, d19, z0.VnD());
__ Clastb(d20, p2, d20, z0.VnD());
__ Clasta(d21, p4, d21, z0.VnD());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_128(0, 0x00000000000000ff, q1);
ASSERT_EQUAL_128(0, 0x00000000000000ff, q2);
ASSERT_EQUAL_128(0, 0x0000000000000011, q3);
ASSERT_EQUAL_128(0, 0x0000000000000010, q4);
ASSERT_EQUAL_128(0, 0x0000000000000019, q5);
ASSERT_EQUAL_128(0, 0x0000000000000018, q6);
ASSERT_EQUAL_128(0, 0x0000000000000010, q7);
ASSERT_EQUAL_128(0, 0x000000000000ffff, q9);
ASSERT_EQUAL_128(0, 0x0000000000001110, q12);
ASSERT_EQUAL_128(0, 0x00000000ffffffff, q13);
ASSERT_EQUAL_128(0, 0x0000000011111110, q14);
ASSERT_EQUAL_128(0, 0x0000000011111110, q18);
ASSERT_EQUAL_128(0, 0xffffffffffffffff, q19);
ASSERT_EQUAL_128(0, 0x1111111111111110, q20);
ASSERT_EQUAL_128(0, 0x1111111111111110, q21);
int vl = core.GetSVELaneCount(kBRegSize) * 8;
switch (vl) {
case 128:
ASSERT_EQUAL_128(0, 0x0000000000001116, q10);
break;
case 512:
case 2048:
ASSERT_EQUAL_128(0, 0x0000000000001118, q10);
break;
default:
printf("WARNING: Some tests skipped due to unexpected VL.\n");
break;
}
}
}
TEST_SVE(sve_clast_z) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
START();
__ Pfalse(p1.VnB());
int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
Initialise(&masm, p2.VnB(), p2_inputs);
Initialise(&masm, p3.VnB(), p3_inputs);
__ Ptrue(p4.VnB());
__ Index(z0.VnB(), 0x10, 1);
__ Dup(z1.VnB(), 0xff);
__ Dup(z2.VnB(), 0xff);
__ Clasta(z1.VnB(), p1, z1.VnB(), z0.VnB());
__ Clastb(z2.VnB(), p1, z2.VnB(), z0.VnB());
__ Clasta(z3.VnB(), p2, z3.VnB(), z0.VnB());
__ Clastb(z4.VnB(), p2, z4.VnB(), z0.VnB());
__ Clasta(z5.VnB(), p3, z5.VnB(), z0.VnB());
__ Clastb(z6.VnB(), p3, z6.VnB(), z0.VnB());
__ Clasta(z7.VnB(), p4, z7.VnB(), z0.VnB());
__ Punpklo(p3.VnH(), p3.VnB());
__ Index(z0.VnH(), 0x1110, 1);
__ Dup(z9.VnB(), 0xff);
__ Clasta(z9.VnH(), p1, z9.VnH(), z0.VnH());
__ Clastb(z10.VnH(), p3, z10.VnH(), z0.VnH());
__ Clasta(z12.VnH(), p4, z12.VnH(), z0.VnH());
__ Index(z0.VnS(), 0x11111110, 1);
__ Dup(z13.VnB(), 0xff);
__ Clasta(z13.VnS(), p1, z13.VnS(), z0.VnS());
__ Clastb(z14.VnS(), p2, z14.VnS(), z0.VnS());
__ Clasta(z16.VnS(), p4, z16.VnS(), z0.VnS());
__ Index(z0.VnD(), 0x1111111111111110, 1);
__ Dup(z17.VnB(), 0xff);
__ Clasta(z17.VnD(), p1, z17.VnD(), z0.VnD());
__ Clastb(z18.VnD(), p2, z18.VnD(), z0.VnD());
__ Clasta(z20.VnD(), p4, z20.VnD(), z0.VnD());
END();
if (CAN_RUN()) {
RUN();
uint64_t z1_expected[] = {0xffffffffffffffff, 0xffffffffffffffff};
uint64_t z2_expected[] = {0xffffffffffffffff, 0xffffffffffffffff};
uint64_t z3_expected[] = {0x1111111111111111, 0x1111111111111111};
uint64_t z4_expected[] = {0x1010101010101010, 0x1010101010101010};
uint64_t z5_expected[] = {0x1919191919191919, 0x1919191919191919};
uint64_t z6_expected[] = {0x1818181818181818, 0x1818181818181818};
uint64_t z7_expected[] = {0x1010101010101010, 0x1010101010101010};
uint64_t z9_expected[] = {0xffffffffffffffff, 0xffffffffffffffff};
uint64_t z12_expected[] = {0x1110111011101110, 0x1110111011101110};
uint64_t z13_expected[] = {0xffffffffffffffff, 0xffffffffffffffff};
uint64_t z14_expected[] = {0x1111111011111110, 0x1111111011111110};
uint64_t z16_expected[] = {0x1111111011111110, 0x1111111011111110};
uint64_t z17_expected[] = {0xffffffffffffffff, 0xffffffffffffffff};
uint64_t z18_expected[] = {0x1111111111111110, 0x1111111111111110};
uint64_t z20_expected[] = {0x1111111111111110, 0x1111111111111110};
uint64_t z10_expected_vl128[] = {0x1116111611161116, 0x1116111611161116};
uint64_t z10_expected_vl_long[] = {0x1118111811181118, 0x1118111811181118};
ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
ASSERT_EQUAL_SVE(z7_expected, z7.VnD());
ASSERT_EQUAL_SVE(z9_expected, z9.VnD());
ASSERT_EQUAL_SVE(z12_expected, z12.VnD());
ASSERT_EQUAL_SVE(z13_expected, z13.VnD());
ASSERT_EQUAL_SVE(z14_expected, z14.VnD());
ASSERT_EQUAL_SVE(z16_expected, z16.VnD());
ASSERT_EQUAL_SVE(z17_expected, z17.VnD());
ASSERT_EQUAL_SVE(z18_expected, z18.VnD());
ASSERT_EQUAL_SVE(z20_expected, z20.VnD());
int vl = core.GetSVELaneCount(kBRegSize) * 8;
switch (vl) {
case 128:
ASSERT_EQUAL_SVE(z10_expected_vl128, z10.VnD());
break;
case 512:
case 2048:
ASSERT_EQUAL_SVE(z10_expected_vl_long, z10.VnD());
break;
default:
printf("WARNING: Some tests skipped due to unexpected VL.\n");
break;
}
}
}
TEST_SVE(sve_compact) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
START();
__ Ptrue(p0.VnB());
__ Pfalse(p1.VnB());
__ Zip1(p2.VnS(), p0.VnS(), p1.VnS());
__ Zip1(p3.VnS(), p1.VnS(), p0.VnS());
__ Zip1(p4.VnD(), p0.VnD(), p1.VnD());
__ Index(z0.VnS(), 0x11111111, 0x11111111);
__ Mov(q0, q0);
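// Mov(q0, q0) clears the bits above the low 128 bits of z0, so the expected
// results below hold for any vector length.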
__ Compact(z1.VnS(), p0, z0.VnS());
__ Compact(z2.VnS(), p2, z0.VnS());
__ Compact(z0.VnS(), p3, z0.VnS());
__ Index(z3.VnD(), 0x1111111111111111, 0x1111111111111111);
__ Mov(q3, q3);
__ Compact(z4.VnD(), p0, z3.VnD());
__ Compact(z5.VnD(), p1, z3.VnD());
__ Compact(z6.VnD(), p4, z3.VnD());
END();
if (CAN_RUN()) {
RUN();
uint64_t z1_expected[] = {0x4444444433333333, 0x2222222211111111};
uint64_t z2_expected[] = {0x0000000000000000, 0x3333333311111111};
uint64_t z0_expected[] = {0x0000000000000000, 0x4444444422222222};
uint64_t z4_expected[] = {0x2222222222222222, 0x1111111111111111};
uint64_t z5_expected[] = {0x0000000000000000, 0x0000000000000000};
uint64_t z6_expected[] = {0x0000000000000000, 0x1111111111111111};
ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
ASSERT_EQUAL_SVE(z0_expected, z0.VnD());
ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
}
}
TEST_SVE(sve_splice) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
__ Ptrue(p0.VnB());
__ Pfalse(p1.VnB());
int p2b_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
int p3b_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0};
int p4b_inputs[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int p5b_inputs[] = {0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0};
int p6b_inputs[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0};
Initialise(&masm, p2.VnB(), p2b_inputs);
Initialise(&masm, p3.VnB(), p3b_inputs);
Initialise(&masm, p4.VnB(), p4b_inputs);
Initialise(&masm, p5.VnB(), p5b_inputs);
Initialise(&masm, p6.VnB(), p6b_inputs);
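// `Splice` copies the first operand from its first active lane up to its last
// active lane, then fills the remaining lanes of the destination from the low
// lanes of the second operand.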
__ Index(z30.VnB(), 1, 1);
__ Index(z0.VnB(), -1, -1);
__ Splice(z0.VnB(), p0, z0.VnB(), z30.VnB());
__ Index(z1.VnB(), -1, -1);
__ Splice(z1.VnB(), p1, z1.VnB(), z30.VnB());
__ Index(z2.VnB(), -1, -1);
__ Splice(z2.VnB(), p2, z2.VnB(), z30.VnB());
__ Index(z3.VnB(), -1, -1);
__ Splice(z3.VnB(), p3, z3.VnB(), z30.VnB());
__ Index(z4.VnB(), -1, -1);
__ Splice(z4.VnB(), p4, z4.VnB(), z30.VnB());
__ Index(z5.VnB(), -1, -1);
__ Splice(z5.VnB(), p5, z5.VnB(), z30.VnB());
__ Index(z6.VnB(), -1, -1);
__ Splice(z6.VnB(), p6, z6.VnB(), z30.VnB());
int p2h_inputs[] = {0, 0, 0, 0, 0, 0, 1, 0};
int p3h_inputs[] = {0, 0, 1, 0, 0, 0, 1, 0};
Initialise(&masm, p2.VnH(), p2h_inputs);
Initialise(&masm, p3.VnH(), p3h_inputs);
__ Index(z30.VnH(), 1, 1);
__ Index(z29.VnH(), -1, -1);
__ Splice(z7.VnH(), p2, z29.VnH(), z30.VnH());
__ Splice(z8.VnH(), p3, z29.VnH(), z30.VnH());
int p2s_inputs[] = {0, 0, 1, 0};
int p3s_inputs[] = {1, 0, 1, 0};
Initialise(&masm, p2.VnS(), p2s_inputs);
Initialise(&masm, p3.VnS(), p3s_inputs);
__ Index(z30.VnS(), 1, 1);
__ Index(z29.VnS(), -1, -1);
__ Splice(z9.VnS(), p2, z29.VnS(), z30.VnS());
__ Splice(z10.VnS(), p3, z29.VnS(), z30.VnS());
int p2d_inputs[] = {0, 1};
int p3d_inputs[] = {1, 0};
Initialise(&masm, p2.VnD(), p2d_inputs);
Initialise(&masm, p3.VnD(), p3d_inputs);
__ Index(z30.VnD(), 1, 1);
__ Index(z29.VnD(), -1, -1);
__ Splice(z11.VnD(), p2, z29.VnD(), z30.VnD());
__ Splice(z30.VnD(), p3, z29.VnD(), z30.VnD());
END();
if (CAN_RUN()) {
RUN();
uint64_t z0_expected[] = {0xf0f1f2f3f4f5f6f7, 0xf8f9fafbfcfdfeff};
uint64_t z1_expected[] = {0x100f0e0d0c0b0a09, 0x0807060504030201};
uint64_t z2_expected[] = {0x0f0e0d0c0b0a0908, 0x07060504030201ff};
uint64_t z3_expected[] = {0x0f0e0d0c0b0a0908, 0x07060504030201fe};
uint64_t z4_expected[] = {0x0f0e0d0c0b0a0908, 0x07060504030201f0};
uint64_t z5_expected[] = {0x0c0b0a0908070605, 0x04030201f6f7f8f9};
uint64_t z6_expected[] = {0x01f0f1f2f3f4f5f6, 0xf7f8f9fafbfcfdfe};
uint64_t z7_expected[] = {0x0007000600050004, 0x000300020001fffe};
uint64_t z8_expected[] = {0x000300020001fffa, 0xfffbfffcfffdfffe};
uint64_t z9_expected[] = {0x0000000300000002, 0x00000001fffffffe};
uint64_t z10_expected[] = {0x00000001fffffffc, 0xfffffffdfffffffe};
uint64_t z11_expected[] = {0x0000000000000001, 0xffffffffffffffff};
uint64_t z30_expected[] = {0x0000000000000001, 0xfffffffffffffffe};
ASSERT_EQUAL_SVE(z0_expected, z0.VnD());
ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
ASSERT_EQUAL_SVE(z7_expected, z7.VnD());
ASSERT_EQUAL_SVE(z8_expected, z8.VnD());
ASSERT_EQUAL_SVE(z9_expected, z9.VnD());
ASSERT_EQUAL_SVE(z10_expected, z10.VnD());
ASSERT_EQUAL_SVE(z11_expected, z11.VnD());
ASSERT_EQUAL_SVE(z30_expected, z30.VnD());
}
}
TEST_SVE(sve_predicate_logical) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// 0b...01011010'10110111
int p10_inputs[] = {0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1}; // Pm
// 0b...11011001'01010010
int p11_inputs[] = {1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0}; // Pn
// 0b...01010101'10110010
int p12_inputs[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0}; // pg
Initialise(&masm, p10.VnB(), p10_inputs);
Initialise(&masm, p11.VnB(), p11_inputs);
Initialise(&masm, p12.VnB(), p12_inputs);
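// The flag-setting forms (`Ands`, `Bics`) set N (FIRST) if the first active
// lane of the result is true, Z (NONE) if no active lane is true, and C
// (!LAST) if the last active lane is false; V is cleared.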
__ Ands(p0.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
__ Mrs(x0, NZCV);
__ Bics(p1.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
__ Mrs(x1, NZCV);
__ Eor(p2.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
__ Nand(p3.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
__ Nor(p4.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
__ Orn(p5.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
__ Orr(p6.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
__ Sel(p7.VnB(), p12, p11.VnB(), p10.VnB());
END();
if (CAN_RUN()) {
RUN();
// 0b...01010000'00010010
int p0_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0};
// 0b...00000001'00000000
int p1_expected[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0};
// 0b...00000001'10100000
int p2_expected[] = {0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0};
// 0b...00000101'10100000
int p3_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0};
// 0b...00000100'00000000
int p4_expected[] = {0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
// 0b...01010101'00010010
int p5_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0};
// 0b...01010001'10110010
int p6_expected[] = {0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0};
// 0b...01011011'00010111
int p7_expected[] = {0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1};
ASSERT_EQUAL_SVE(p0_expected, p0.VnB());
ASSERT_EQUAL_SVE(p1_expected, p1.VnB());
ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
ASSERT_EQUAL_SVE(p5_expected, p5.VnB());
ASSERT_EQUAL_SVE(p6_expected, p6.VnB());
ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
ASSERT_EQUAL_32(SVEFirstFlag, w0);
ASSERT_EQUAL_32(SVENotLastFlag, w1);
}
}
TEST_SVE(sve_int_compare_vectors) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int z10_inputs[] = {0x00, 0x80, 0xff, 0x7f, 0x00, 0x00, 0x00, 0xff};
int z11_inputs[] = {0x00, 0x00, 0x00, 0x00, 0x80, 0xff, 0x7f, 0xfe};
int p0_inputs[] = {1, 0, 1, 1, 1, 1, 1, 1};
InsrHelper(&masm, z10.VnB(), z10_inputs);
InsrHelper(&masm, z11.VnB(), z11_inputs);
Initialise(&masm, p0.VnB(), p0_inputs);
__ Cmphs(p6.VnB(), p0.Zeroing(), z10.VnB(), z11.VnB());
__ Mrs(x6, NZCV);
uint64_t z12_inputs[] = {0xffffffffffffffff, 0x8000000000000000};
uint64_t z13_inputs[] = {0x0000000000000000, 0x8000000000000000};
int p1_inputs[] = {1, 1};
InsrHelper(&masm, z12.VnD(), z12_inputs);
InsrHelper(&masm, z13.VnD(), z13_inputs);
Initialise(&masm, p1.VnD(), p1_inputs);
__ Cmphi(p7.VnD(), p1.Zeroing(), z12.VnD(), z13.VnD());
__ Mrs(x7, NZCV);
int z14_inputs[] = {0, 32767, -1, -32767, 0, 0, 0, 32766};
int z15_inputs[] = {0, 0, 0, 0, 32767, -1, -32767, 32767};
int p2_inputs[] = {1, 0, 1, 1, 1, 1, 1, 1};
InsrHelper(&masm, z14.VnH(), z14_inputs);
InsrHelper(&masm, z15.VnH(), z15_inputs);
Initialise(&masm, p2.VnH(), p2_inputs);
__ Cmpge(p8.VnH(), p2.Zeroing(), z14.VnH(), z15.VnH());
__ Mrs(x8, NZCV);
__ Cmpeq(p9.VnH(), p2.Zeroing(), z14.VnH(), z15.VnH());
__ Mrs(x9, NZCV);
int z16_inputs[] = {0, -1, 0, 0};
int z17_inputs[] = {0, 0, 2147483647, -2147483648};
int p3_inputs[] = {1, 1, 1, 1};
InsrHelper(&masm, z16.VnS(), z16_inputs);
InsrHelper(&masm, z17.VnS(), z17_inputs);
Initialise(&masm, p3.VnS(), p3_inputs);
__ Cmpgt(p10.VnS(), p3.Zeroing(), z16.VnS(), z17.VnS());
__ Mrs(x10, NZCV);
__ Cmpne(p11.VnS(), p3.Zeroing(), z16.VnS(), z17.VnS());
__ Mrs(x11, NZCV);
// Test the architectural aliases: Cmp{ls, lo, le, lt} reverse their operands
// and encode as Cmp{hs, hi, ge, gt}.
__ Cmpls(p12.VnB(), p0.Zeroing(), z11.VnB(), z10.VnB()); // HS
__ Cmplo(p13.VnD(), p1.Zeroing(), z13.VnD(), z12.VnD()); // HI
__ Cmple(p14.VnH(), p2.Zeroing(), z15.VnH(), z14.VnH()); // GE
__ Cmplt(p15.VnS(), p3.Zeroing(), z17.VnS(), z16.VnS()); // GT
END();
if (CAN_RUN()) {
RUN();
int p6_expected[] = {1, 0, 1, 1, 0, 0, 0, 1};
for (size_t i = 0; i < ArrayLength(p6_expected); i++) {
int lane = static_cast<int>(ArrayLength(p6_expected) - i - 1);
ASSERT_EQUAL_SVE_LANE(p6_expected[i], p6.VnB(), lane);
}
int p7_expected[] = {1, 0};
ASSERT_EQUAL_SVE(p7_expected, p7.VnD());
int p8_expected[] = {1, 0, 0, 0, 0, 1, 1, 0};
ASSERT_EQUAL_SVE(p8_expected, p8.VnH());
int p9_expected[] = {1, 0, 0, 0, 0, 0, 0, 0};
ASSERT_EQUAL_SVE(p9_expected, p9.VnH());
int p10_expected[] = {0, 0, 0, 1};
ASSERT_EQUAL_SVE(p10_expected, p10.VnS());
int p11_expected[] = {0, 1, 1, 1};
ASSERT_EQUAL_SVE(p11_expected, p11.VnS());
// Reuse the expected results to verify the architectural aliases.
ASSERT_EQUAL_SVE(p6_expected, p12.VnB());
ASSERT_EQUAL_SVE(p7_expected, p13.VnD());
ASSERT_EQUAL_SVE(p8_expected, p14.VnH());
ASSERT_EQUAL_SVE(p10_expected, p15.VnS());
ASSERT_EQUAL_32(SVEFirstFlag, w6);
ASSERT_EQUAL_32(NoFlag, w7);
ASSERT_EQUAL_32(NoFlag, w8);
ASSERT_EQUAL_32(NoFlag, w9);
ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w10);
}
}
TEST_SVE(sve_int_compare_vectors_wide_elements) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int src1_inputs_1[] = {0, 1, -1, -128, 127, 100, -66};
int src2_inputs_1[] = {0, -1};
int mask_inputs_1[] = {1, 1, 1, 1, 1, 0, 1};
InsrHelper(&masm, z13.VnB(), src1_inputs_1);
InsrHelper(&masm, z19.VnD(), src2_inputs_1);
Initialise(&masm, p0.VnB(), mask_inputs_1);
__ Cmpge(p2.VnB(), p0.Zeroing(), z13.VnB(), z19.VnD());
__ Mrs(x2, NZCV);
__ Cmpgt(p3.VnB(), p0.Zeroing(), z13.VnB(), z19.VnD());
__ Mrs(x3, NZCV);
int src1_inputs_2[] = {0, 32767, -1, -32767, 1, 1234, 0, 32766};
int src2_inputs_2[] = {0, -32767};
int mask_inputs_2[] = {1, 0, 1, 1, 1, 1, 1, 1};
InsrHelper(&masm, z13.VnH(), src1_inputs_2);
InsrHelper(&masm, z19.VnD(), src2_inputs_2);
Initialise(&masm, p0.VnH(), mask_inputs_2);
__ Cmple(p4.VnH(), p0.Zeroing(), z13.VnH(), z19.VnD());
__ Mrs(x4, NZCV);
__ Cmplt(p5.VnH(), p0.Zeroing(), z13.VnH(), z19.VnD());
__ Mrs(x5, NZCV);
int src1_inputs_3[] = {0, -1, 2147483647, -2147483648};
int src2_inputs_3[] = {0, -2147483648};
int mask_inputs_3[] = {1, 1, 1, 1};
InsrHelper(&masm, z13.VnS(), src1_inputs_3);
InsrHelper(&masm, z19.VnD(), src2_inputs_3);
Initialise(&masm, p0.VnS(), mask_inputs_3);
__ Cmpeq(p6.VnS(), p0.Zeroing(), z13.VnS(), z19.VnD());
__ Mrs(x6, NZCV);
__ Cmpne(p7.VnS(), p0.Zeroing(), z13.VnS(), z19.VnD());
__ Mrs(x7, NZCV);
int src1_inputs_4[] = {0x00, 0x80, 0x7f, 0xff, 0x7f, 0xf0, 0x0f, 0x55};
int src2_inputs_4[] = {0x00, 0x7f};
int mask_inputs_4[] = {1, 1, 1, 1, 0, 1, 1, 1};
InsrHelper(&masm, z13.VnB(), src1_inputs_4);
InsrHelper(&masm, z19.VnD(), src2_inputs_4);
Initialise(&masm, p0.VnB(), mask_inputs_4);
__ Cmplo(p8.VnB(), p0.Zeroing(), z13.VnB(), z19.VnD());
__ Mrs(x8, NZCV);
__ Cmpls(p9.VnB(), p0.Zeroing(), z13.VnB(), z19.VnD());
__ Mrs(x9, NZCV);
int src1_inputs_5[] = {0x0000, 0x8000, 0x7fff, 0xffff};
int src2_inputs_5[] = {0x8000, 0xffff};
int mask_inputs_5[] = {1, 1, 1, 1};
InsrHelper(&masm, z13.VnS(), src1_inputs_5);
InsrHelper(&masm, z19.VnD(), src2_inputs_5);
Initialise(&masm, p0.VnS(), mask_inputs_5);
__ Cmphi(p10.VnS(), p0.Zeroing(), z13.VnS(), z19.VnD());
__ Mrs(x10, NZCV);
__ Cmphs(p11.VnS(), p0.Zeroing(), z13.VnS(), z19.VnD());
__ Mrs(x11, NZCV);
END();
if (CAN_RUN()) {
RUN();
int p2_expected[] = {1, 1, 1, 0, 1, 0, 0};
ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
int p3_expected[] = {1, 1, 0, 0, 1, 0, 0};
ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
int p4_expected[] = {0x1, 0x0, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0};
ASSERT_EQUAL_SVE(p4_expected, p4.VnH());
int p5_expected[] = {0x0, 0x0, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0};
ASSERT_EQUAL_SVE(p5_expected, p5.VnH());
int p6_expected[] = {0x1, 0x0, 0x0, 0x1};
ASSERT_EQUAL_SVE(p6_expected, p6.VnS());
int p7_expected[] = {0x0, 0x1, 0x1, 0x0};
ASSERT_EQUAL_SVE(p7_expected, p7.VnS());
int p8_expected[] = {1, 0, 0, 0, 0, 0, 1, 1};
ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
int p9_expected[] = {1, 0, 1, 0, 0, 0, 1, 1};
ASSERT_EQUAL_SVE(p9_expected, p9.VnB());
int p10_expected[] = {0x0, 0x0, 0x0, 0x0};
ASSERT_EQUAL_SVE(p10_expected, p10.VnS());
int p11_expected[] = {0x0, 0x1, 0x0, 0x1};
ASSERT_EQUAL_SVE(p11_expected, p11.VnS());
ASSERT_EQUAL_32(NoFlag, w2);
ASSERT_EQUAL_32(NoFlag, w3);
ASSERT_EQUAL_32(NoFlag, w4);
ASSERT_EQUAL_32(SVENotLastFlag, w5);
ASSERT_EQUAL_32(SVEFirstFlag, w6);
ASSERT_EQUAL_32(SVENotLastFlag, w7);
ASSERT_EQUAL_32(SVEFirstFlag, w8);
ASSERT_EQUAL_32(SVEFirstFlag, w9);
ASSERT_EQUAL_32(SVENotLastFlag | SVENoneFlag, w10);
ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w11);
}
}
TEST_SVE(sve_bitwise_imm) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// clang-format off
uint64_t z21_inputs[] = {0xfedcba9876543210, 0x0123456789abcdef};
uint32_t z22_inputs[] = {0xfedcba98, 0x76543210, 0x01234567, 0x89abcdef};
uint16_t z23_inputs[] = {0xfedc, 0xba98, 0x7654, 0x3210,
0x0123, 0x4567, 0x89ab, 0xcdef};
uint8_t z24_inputs[] = {0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10,
0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef};
// clang-format on
InsrHelper(&masm, z1.VnD(), z21_inputs);
InsrHelper(&masm, z2.VnS(), z22_inputs);
InsrHelper(&masm, z3.VnH(), z23_inputs);
InsrHelper(&masm, z4.VnB(), z24_inputs);
__ And(z1.VnD(), z1.VnD(), 0x0000ffff0000ffff);
__ And(z2.VnS(), z2.VnS(), 0xff0000ff);
__ And(z3.VnH(), z3.VnH(), 0x0ff0);
__ And(z4.VnB(), z4.VnB(), 0x3f);
InsrHelper(&masm, z5.VnD(), z21_inputs);
InsrHelper(&masm, z6.VnS(), z22_inputs);
InsrHelper(&masm, z7.VnH(), z23_inputs);
InsrHelper(&masm, z8.VnB(), z24_inputs);
__ Eor(z5.VnD(), z5.VnD(), 0x0000ffff0000ffff);
__ Eor(z6.VnS(), z6.VnS(), 0xff0000ff);
__ Eor(z7.VnH(), z7.VnH(), 0x0ff0);
__ Eor(z8.VnB(), z8.VnB(), 0x3f);
InsrHelper(&masm, z9.VnD(), z21_inputs);
InsrHelper(&masm, z10.VnS(), z22_inputs);
InsrHelper(&masm, z11.VnH(), z23_inputs);
InsrHelper(&masm, z12.VnB(), z24_inputs);
__ Orr(z9.VnD(), z9.VnD(), 0x0000ffff0000ffff);
__ Orr(z10.VnS(), z10.VnS(), 0xff0000ff);
__ Orr(z11.VnH(), z11.VnH(), 0x0ff0);
__ Orr(z12.VnB(), z12.VnB(), 0x3f);
{
// The `Dup` macro maps onto either `dup` or `dupm`, but has its own test,
// so here we test `dupm` directly.
ExactAssemblyScope guard(&masm, 4 * kInstructionSize);
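// `dupm` takes a logical bitmask immediate, as used by `and` and `orr`; each
// of the values below is encodable for its lane size.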
__ dupm(z13.VnD(), 0x7ffffff800000000);
__ dupm(z14.VnS(), 0x7ffc7ffc);
__ dupm(z15.VnH(), 0x3ffc);
__ dupm(z16.VnB(), 0xc3);
}
END();
if (CAN_RUN()) {
RUN();
// clang-format off
uint64_t z1_expected[] = {0x0000ba9800003210, 0x000045670000cdef};
uint32_t z2_expected[] = {0xfe000098, 0x76000010, 0x01000067, 0x890000ef};
uint16_t z3_expected[] = {0x0ed0, 0x0a90, 0x0650, 0x0210,
0x0120, 0x0560, 0x09a0, 0x0de0};
uint8_t z4_expected[] = {0x3e, 0x1c, 0x3a, 0x18, 0x36, 0x14, 0x32, 0x10,
0x01, 0x23, 0x05, 0x27, 0x09, 0x2b, 0x0d, 0x2f};
ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
ASSERT_EQUAL_SVE(z2_expected, z2.VnS());
ASSERT_EQUAL_SVE(z3_expected, z3.VnH());
ASSERT_EQUAL_SVE(z4_expected, z4.VnB());
uint64_t z5_expected[] = {0xfedc45677654cdef, 0x0123ba9889ab3210};
uint32_t z6_expected[] = {0x01dcba67, 0x895432ef, 0xfe234598, 0x76abcd10};
uint16_t z7_expected[] = {0xf12c, 0xb568, 0x79a4, 0x3de0,
0x0ed3, 0x4a97, 0x865b, 0xc21f};
uint8_t z8_expected[] = {0xc1, 0xe3, 0x85, 0xa7, 0x49, 0x6b, 0x0d, 0x2f,
0x3e, 0x1c, 0x7a, 0x58, 0xb6, 0x94, 0xf2, 0xd0};
ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
ASSERT_EQUAL_SVE(z6_expected, z6.VnS());
ASSERT_EQUAL_SVE(z7_expected, z7.VnH());
ASSERT_EQUAL_SVE(z8_expected, z8.VnB());
uint64_t z9_expected[] = {0xfedcffff7654ffff, 0x0123ffff89abffff};
uint32_t z10_expected[] = {0xffdcbaff, 0xff5432ff, 0xff2345ff, 0xffabcdff};
uint16_t z11_expected[] = {0xfffc, 0xbff8, 0x7ff4, 0x3ff0,
0x0ff3, 0x4ff7, 0x8ffb, 0xcfff};
uint8_t z12_expected[] = {0xff, 0xff, 0xbf, 0xbf, 0x7f, 0x7f, 0x3f, 0x3f,
0x3f, 0x3f, 0x7f, 0x7f, 0xbf, 0xbf, 0xff, 0xff};
ASSERT_EQUAL_SVE(z9_expected, z9.VnD());
ASSERT_EQUAL_SVE(z10_expected, z10.VnS());
ASSERT_EQUAL_SVE(z11_expected, z11.VnH());
ASSERT_EQUAL_SVE(z12_expected, z12.VnB());
uint64_t z13_expected[] = {0x7ffffff800000000, 0x7ffffff800000000};
uint32_t z14_expected[] = {0x7ffc7ffc, 0x7ffc7ffc, 0x7ffc7ffc, 0x7ffc7ffc};
uint16_t z15_expected[] = {0x3ffc, 0x3ffc, 0x3ffc, 0x3ffc,
0x3ffc, 0x3ffc, 0x3ffc, 0x3ffc};
ASSERT_EQUAL_SVE(z13_expected, z13.VnD());
ASSERT_EQUAL_SVE(z14_expected, z14.VnS());
ASSERT_EQUAL_SVE(z15_expected, z15.VnH());
// clang-format on
}
}
TEST_SVE(sve_dup_imm) {
// The `Dup` macro can generate `dup` or `dupm`, and it can synthesise
// immediates that neither instruction encodes directly.
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// Encodable with `dup` (shift 0).
__ Dup(z0.VnD(), -1);
__ Dup(z1.VnS(), 0x7f);
__ Dup(z2.VnH(), -0x80);
__ Dup(z3.VnB(), 42);
// Encodable with `dup` (shift 8).
__ Dup(z4.VnD(), -42 * 256);
__ Dup(z5.VnS(), -0x8000);
__ Dup(z6.VnH(), 0x7f00);
// B-sized lanes cannot take a shift of 8.
// Encodable with `dupm` (but not `dup`).
__ Dup(z10.VnD(), 0x3fc);
__ Dup(z11.VnS(), -516097); // 0xfff81fff, as a signed int.
__ Dup(z12.VnH(), 0x0001);
// All values that fit B-sized lanes are encodable with `dup`.
// Cases that require immediate synthesis.
__ Dup(z20.VnD(), 0x1234);
__ Dup(z21.VnD(), -4242);
__ Dup(z22.VnD(), 0xfedcba9876543210);
__ Dup(z23.VnS(), 0x01020304);
__ Dup(z24.VnS(), -0x01020304);
__ Dup(z25.VnH(), 0x3c38);
// All values that fit B-sized lanes are directly encodable.
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(0xffffffffffffffff, z0.VnD());
ASSERT_EQUAL_SVE(0x0000007f, z1.VnS());
ASSERT_EQUAL_SVE(0xff80, z2.VnH());
ASSERT_EQUAL_SVE(0x2a, z3.VnB());
ASSERT_EQUAL_SVE(0xffffffffffffd600, z4.VnD());
ASSERT_EQUAL_SVE(0xffff8000, z5.VnS());
ASSERT_EQUAL_SVE(0x7f00, z6.VnH());
ASSERT_EQUAL_SVE(0x00000000000003fc, z10.VnD());
ASSERT_EQUAL_SVE(0xfff81fff, z11.VnS());
ASSERT_EQUAL_SVE(0x0001, z12.VnH());
ASSERT_EQUAL_SVE(0x1234, z20.VnD());
ASSERT_EQUAL_SVE(0xffffffffffffef6e, z21.VnD());
ASSERT_EQUAL_SVE(0xfedcba9876543210, z22.VnD());
ASSERT_EQUAL_SVE(0x01020304, z23.VnS());
ASSERT_EQUAL_SVE(0xfefdfcfc, z24.VnS());
ASSERT_EQUAL_SVE(0x3c38, z25.VnH());
}
}
TEST_SVE(sve_inc_dec_p_scalar) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
Initialise(&masm, p0.VnB(), p0_inputs);
int p0_b_count = 9;
int p0_h_count = 5;
int p0_s_count = 3;
int p0_d_count = 2;
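// `Incp` and `Decp` count the active lanes of the given size; for lanes wider
// than a byte, only the lowest P bit of each lane is significant.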
// 64-bit operations preserve their high bits.
__ Mov(x0, 0x123456780000002a);
__ Decp(x0, p0.VnB());
__ Mov(x1, 0x123456780000002a);
__ Incp(x1, p0.VnH());
// Check that saturation does not occur.
__ Mov(x10, 1);
__ Decp(x10, p0.VnS());
__ Mov(x11, UINT64_MAX);
__ Incp(x11, p0.VnD());
__ Mov(x12, INT64_MAX);
__ Incp(x12, p0.VnB());
// With an all-true predicate, these instructions increment or decrement by
// the vector length.
__ Ptrue(p15.VnB());
__ Mov(x20, 0x4000000000000000);
__ Decp(x20, p15.VnB());
__ Mov(x21, 0x4000000000000000);
__ Incp(x21, p15.VnH());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_64(0x123456780000002a - p0_b_count, x0);
ASSERT_EQUAL_64(0x123456780000002a + p0_h_count, x1);
ASSERT_EQUAL_64(UINT64_C(1) - p0_s_count, x10);
ASSERT_EQUAL_64(UINT64_MAX + p0_d_count, x11);
ASSERT_EQUAL_64(static_cast<uint64_t>(INT64_MAX) + p0_b_count, x12);
ASSERT_EQUAL_64(0x4000000000000000 - core.GetSVELaneCount(kBRegSize), x20);
ASSERT_EQUAL_64(0x4000000000000000 + core.GetSVELaneCount(kHRegSize), x21);
}
}
TEST_SVE(sve_sqinc_sqdec_p_scalar) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
Initialise(&masm, p0.VnB(), p0_inputs);
int p0_b_count = 9;
int p0_h_count = 5;
int p0_s_count = 3;
int p0_d_count = 2;
uint64_t placeholder_high = 0x1234567800000000;
// 64-bit operations preserve their high bits.
__ Mov(x0, placeholder_high + 42);
__ Sqdecp(x0, p0.VnB());
__ Mov(x1, placeholder_high + 42);
__ Sqincp(x1, p0.VnH());
// 32-bit operations sign-extend into their high bits.
__ Mov(x2, placeholder_high + 42);
__ Sqdecp(x2, p0.VnS(), w2);
__ Mov(x3, placeholder_high + 42);
__ Sqincp(x3, p0.VnD(), w3);
__ Mov(x4, placeholder_high + 1);
__ Sqdecp(x4, p0.VnS(), w4);
__ Mov(x5, placeholder_high - 1);
__ Sqincp(x5, p0.VnD(), w5);
// Check that saturation behaves correctly.
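// `Sqdecp` saturates at INT64_MIN (or INT32_MIN for the W form); `Sqincp`
// saturates at INT64_MAX (or INT32_MAX).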
__ Mov(x10, 0x8000000000000001); // INT64_MIN + 1
__ Sqdecp(x10, p0.VnB());
__ Mov(x11, placeholder_high + 0x80000001); // INT32_MIN + 1
__ Sqdecp(x11, p0.VnH(), w11);
__ Mov(x12, 1);
__ Sqdecp(x12, p0.VnS());
__ Mov(x13, placeholder_high + 1);
__ Sqdecp(x13, p0.VnD(), w13);
__ Mov(x14, 0x7ffffffffffffffe); // INT64_MAX - 1
__ Sqincp(x14, p0.VnB());
__ Mov(x15, placeholder_high + 0x7ffffffe); // INT32_MAX - 1
__ Sqincp(x15, p0.VnH(), w15);
// Don't use x16 and x17 since they are scratch registers by default.
__ Mov(x18, 0xffffffffffffffff);
__ Sqincp(x18, p0.VnS());
__ Mov(x19, placeholder_high + 0xffffffff);
__ Sqincp(x19, p0.VnD(), w19);
__ Mov(x20, placeholder_high + 0xffffffff);
__ Sqdecp(x20, p0.VnB(), w20);
// With an all-true predicate, these instructions increment or decrement by
// the vector length.
__ Ptrue(p15.VnB());
__ Mov(x21, 0);
__ Sqdecp(x21, p15.VnB());
__ Mov(x22, 0);
__ Sqincp(x22, p15.VnH());
__ Mov(x23, placeholder_high);
__ Sqdecp(x23, p15.VnS(), w23);
__ Mov(x24, placeholder_high);
__ Sqincp(x24, p15.VnD(), w24);
END();
if (CAN_RUN()) {
RUN();
// 64-bit operations preserve their high bits.
ASSERT_EQUAL_64(placeholder_high + 42 - p0_b_count, x0);
ASSERT_EQUAL_64(placeholder_high + 42 + p0_h_count, x1);
// 32-bit operations sign-extend into their high bits.
ASSERT_EQUAL_64(42 - p0_s_count, x2);
ASSERT_EQUAL_64(42 + p0_d_count, x3);
ASSERT_EQUAL_64(0xffffffff00000000 | (1 - p0_s_count), x4);
ASSERT_EQUAL_64(p0_d_count - 1, x5);
// Check that saturation behaves correctly.
ASSERT_EQUAL_64(INT64_MIN, x10);
ASSERT_EQUAL_64(INT32_MIN, x11);
ASSERT_EQUAL_64(1 - p0_s_count, x12);
ASSERT_EQUAL_64(1 - p0_d_count, x13);
ASSERT_EQUAL_64(INT64_MAX, x14);
ASSERT_EQUAL_64(INT32_MAX, x15);
ASSERT_EQUAL_64(p0_s_count - 1, x18);
ASSERT_EQUAL_64(p0_d_count - 1, x19);
ASSERT_EQUAL_64(-1 - p0_b_count, x20);
// Check all-true predicates.
ASSERT_EQUAL_64(-core.GetSVELaneCount(kBRegSize), x21);
ASSERT_EQUAL_64(core.GetSVELaneCount(kHRegSize), x22);
ASSERT_EQUAL_64(-core.GetSVELaneCount(kSRegSize), x23);
ASSERT_EQUAL_64(core.GetSVELaneCount(kDRegSize), x24);
}
}
TEST_SVE(sve_uqinc_uqdec_p_scalar) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
Initialise(&masm, p0.VnB(), p0_inputs);
int p0_b_count = 9;
int p0_h_count = 5;
int p0_s_count = 3;
int p0_d_count = 2;
uint64_t placeholder_high = 0x1234567800000000;
// 64-bit operations preserve their high bits.
__ Mov(x0, placeholder_high + 42);
__ Uqdecp(x0, p0.VnB());
__ Mov(x1, placeholder_high + 42);
__ Uqincp(x1, p0.VnH());
// 32-bit operations zero-extend into their high bits.
__ Mov(x2, placeholder_high + 42);
__ Uqdecp(x2, p0.VnS(), w2);
__ Mov(x3, placeholder_high + 42);
__ Uqincp(x3, p0.VnD(), w3);
__ Mov(x4, placeholder_high + 0x80000001);
__ Uqdecp(x4, p0.VnS(), w4);
__ Mov(x5, placeholder_high + 0x7fffffff);
__ Uqincp(x5, p0.VnD(), w5);
// Check that saturation behaves correctly.
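// `Uqdecp` saturates at zero; `Uqincp` saturates at UINT64_MAX (or UINT32_MAX
// for the W form).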
__ Mov(x10, 1);
__ Uqdecp(x10, p0.VnB(), x10);
__ Mov(x11, placeholder_high + 1);
__ Uqdecp(x11, p0.VnH(), w11);
__ Mov(x12, 0x8000000000000000); // INT64_MAX + 1
__ Uqdecp(x12, p0.VnS(), x12);
__ Mov(x13, placeholder_high + 0x80000000); // INT32_MAX + 1
__ Uqdecp(x13, p0.VnD(), w13);
__ Mov(x14, 0xfffffffffffffffe); // UINT64_MAX - 1
__ Uqincp(x14, p0.VnB(), x14);
__ Mov(x15, placeholder_high + 0xfffffffe); // UINT32_MAX - 1
__ Uqincp(x15, p0.VnH(), w15);
// Don't use x16 and x17 since they are scratch registers by default.
__ Mov(x18, 0x7ffffffffffffffe); // INT64_MAX - 1
__ Uqincp(x18, p0.VnS(), x18);
__ Mov(x19, placeholder_high + 0x7ffffffe); // INT32_MAX - 1
__ Uqincp(x19, p0.VnD(), w19);
// With an all-true predicate, these instructions increment or decrement by
// the vector length.
__ Ptrue(p15.VnB());
__ Mov(x20, 0x4000000000000000);
__ Uqdecp(x20, p15.VnB(), x20);
__ Mov(x21, 0x4000000000000000);
__ Uqincp(x21, p15.VnH(), x21);
__ Mov(x22, placeholder_high + 0x40000000);
__ Uqdecp(x22, p15.VnS(), w22);
__ Mov(x23, placeholder_high + 0x40000000);
__ Uqincp(x23, p15.VnD(), w23);
END();
if (CAN_RUN()) {
RUN();
// 64-bit operations preserve their high bits.
ASSERT_EQUAL_64(placeholder_high + 42 - p0_b_count, x0);
ASSERT_EQUAL_64(placeholder_high + 42 + p0_h_count, x1);
// 32-bit operations zero-extend into their high bits.
ASSERT_EQUAL_64(42 - p0_s_count, x2);
ASSERT_EQUAL_64(42 + p0_d_count, x3);
ASSERT_EQUAL_64(UINT64_C(0x80000001) - p0_s_count, x4);
ASSERT_EQUAL_64(UINT64_C(0x7fffffff) + p0_d_count, x5);
// Check that saturation behaves correctly.
ASSERT_EQUAL_64(0, x10);
ASSERT_EQUAL_64(0, x11);
ASSERT_EQUAL_64(0x8000000000000000 - p0_s_count, x12);
ASSERT_EQUAL_64(UINT64_C(0x80000000) - p0_d_count, x13);
ASSERT_EQUAL_64(UINT64_MAX, x14);
ASSERT_EQUAL_64(UINT32_MAX, x15);
ASSERT_EQUAL_64(0x7ffffffffffffffe + p0_s_count, x18);
ASSERT_EQUAL_64(UINT64_C(0x7ffffffe) + p0_d_count, x19);
// Check all-true predicates.
ASSERT_EQUAL_64(0x4000000000000000 - core.GetSVELaneCount(kBRegSize), x20);
ASSERT_EQUAL_64(0x4000000000000000 + core.GetSVELaneCount(kHRegSize), x21);
ASSERT_EQUAL_64(0x40000000 - core.GetSVELaneCount(kSRegSize), x22);
ASSERT_EQUAL_64(0x40000000 + core.GetSVELaneCount(kDRegSize), x23);
}
}
TEST_SVE(sve_inc_dec_p_vector) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// There are {5, 3, 2} active {H, S, D} lanes. B-sized lanes are ignored.
int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
Initialise(&masm, p0.VnB(), p0_inputs);
// Check that saturation does not occur.
int64_t z0_inputs[] = {0x1234567800000042, 0, 1, INT64_MIN};
InsrHelper(&masm, z0.VnD(), z0_inputs);
int64_t z1_inputs[] = {0x12345678ffffff2a, 0, -1, INT64_MAX};
InsrHelper(&masm, z1.VnD(), z1_inputs);
int32_t z2_inputs[] = {0x12340042, 0, -1, 1, INT32_MAX, INT32_MIN};
InsrHelper(&masm, z2.VnS(), z2_inputs);
int16_t z3_inputs[] = {0x122a, 0, 1, -1, INT16_MIN, INT16_MAX};
InsrHelper(&masm, z3.VnH(), z3_inputs);
// The MacroAssembler implements non-destructive operations using movprfx.
__ Decp(z10.VnD(), p0, z0.VnD());
__ Decp(z11.VnD(), p0, z1.VnD());
__ Decp(z12.VnS(), p0, z2.VnS());
__ Decp(z13.VnH(), p0, z3.VnH());
__ Incp(z14.VnD(), p0, z0.VnD());
__ Incp(z15.VnD(), p0, z1.VnD());
__ Incp(z16.VnS(), p0, z2.VnS());
__ Incp(z17.VnH(), p0, z3.VnH());
// Also test destructive forms.
__ Mov(z4, z0);
__ Mov(z5, z1);
__ Mov(z6, z2);
__ Mov(z7, z3);
__ Decp(z0.VnD(), p0);
__ Decp(z1.VnD(), p0);
__ Decp(z2.VnS(), p0);
__ Decp(z3.VnH(), p0);
__ Incp(z4.VnD(), p0);
__ Incp(z5.VnD(), p0);
__ Incp(z6.VnS(), p0);
__ Incp(z7.VnH(), p0);
END();
if (CAN_RUN()) {
RUN();
// z0_inputs[...] - number of active D lanes (2)
int64_t z0_expected[] = {0x1234567800000040, -2, -1, 0x7ffffffffffffffe};
ASSERT_EQUAL_SVE(z0_expected, z0.VnD());
// z1_inputs[...] - number of active D lanes (2)
int64_t z1_expected[] = {0x12345678ffffff28, -2, -3, 0x7ffffffffffffffd};
ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
// z2_inputs[...] - number of active S lanes (3)
int32_t z2_expected[] = {0x1234003f, -3, -4, -2, 0x7ffffffc, 0x7ffffffd};
ASSERT_EQUAL_SVE(z2_expected, z2.VnS());
// z3_inputs[...] - number of active H lanes (5)
int16_t z3_expected[] = {0x1225, -5, -4, -6, 0x7ffb, 0x7ffa};
ASSERT_EQUAL_SVE(z3_expected, z3.VnH());
// z0_inputs[...] + number of active D lanes (2)
uint64_t z4_expected[] = {0x1234567800000044, 2, 3, 0x8000000000000002};
ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
// z1_inputs[...] + number of active D lanes (2)
uint64_t z5_expected[] = {0x12345678ffffff2c, 2, 1, 0x8000000000000001};
ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
// z2_inputs[...] + number of active S lanes (3)
uint32_t z6_expected[] = {0x12340045, 3, 2, 4, 0x80000002, 0x80000003};
ASSERT_EQUAL_SVE(z6_expected, z6.VnS());
// z3_inputs[...] + number of active H lanes (5)
uint16_t z7_expected[] = {0x122f, 5, 6, 4, 0x8005, 0x8004};
ASSERT_EQUAL_SVE(z7_expected, z7.VnH());
// Check that the non-destructive macros produced the same results.
ASSERT_EQUAL_SVE(z0_expected, z10.VnD());
ASSERT_EQUAL_SVE(z1_expected, z11.VnD());
ASSERT_EQUAL_SVE(z2_expected, z12.VnS());
ASSERT_EQUAL_SVE(z3_expected, z13.VnH());
ASSERT_EQUAL_SVE(z4_expected, z14.VnD());
ASSERT_EQUAL_SVE(z5_expected, z15.VnD());
ASSERT_EQUAL_SVE(z6_expected, z16.VnS());
ASSERT_EQUAL_SVE(z7_expected, z17.VnH());
}
}
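// The non-saturating forms wrap modulo the lane width. A minimal sketch of
// the per-lane arithmetic checked above (illustrative only; a negative
// `count` models the decrementing forms):
static uint64_t IncpLaneModel(uint64_t lane, uint64_t lane_mask, int count) {
  // E.g. incrementing INT64_MIN by 2 wraps to 0x8000000000000002, as in
  // z4_expected above.
  return (lane + count) & lane_mask;
}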
TEST_SVE(sve_inc_dec_ptrue_vector) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// With an all-true predicate, these instructions increment or decrement by
// the vector length.
__ Ptrue(p15.VnB());
__ Dup(z0.VnD(), 0);
__ Decp(z0.VnD(), p15);
__ Dup(z1.VnS(), 0);
__ Decp(z1.VnS(), p15);
__ Dup(z2.VnH(), 0);
__ Decp(z2.VnH(), p15);
__ Dup(z3.VnD(), 0);
__ Incp(z3.VnD(), p15);
__ Dup(z4.VnS(), 0);
__ Incp(z4.VnS(), p15);
__ Dup(z5.VnH(), 0);
__ Incp(z5.VnH(), p15);
END();
if (CAN_RUN()) {
RUN();
int d_lane_count = core.GetSVELaneCount(kDRegSize);
int s_lane_count = core.GetSVELaneCount(kSRegSize);
int h_lane_count = core.GetSVELaneCount(kHRegSize);
for (int i = 0; i < d_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE(-d_lane_count, z0.VnD(), i);
ASSERT_EQUAL_SVE_LANE(d_lane_count, z3.VnD(), i);
}
for (int i = 0; i < s_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE(-s_lane_count, z1.VnS(), i);
ASSERT_EQUAL_SVE_LANE(s_lane_count, z4.VnS(), i);
}
for (int i = 0; i < h_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE(-h_lane_count, z2.VnH(), i);
ASSERT_EQUAL_SVE_LANE(h_lane_count, z5.VnH(), i);
}
}
}
TEST_SVE(sve_sqinc_sqdec_p_vector) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// There are {5, 3, 2} active {H, S, D} lanes. B-sized lanes are ignored.
int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
Initialise(&masm, p0.VnB(), p0_inputs);
// Check that saturation behaves correctly.
int64_t z0_inputs[] = {0x1234567800000042, 0, 1, INT64_MIN};
InsrHelper(&masm, z0.VnD(), z0_inputs);
int64_t z1_inputs[] = {0x12345678ffffff2a, 0, -1, INT64_MAX};
InsrHelper(&masm, z1.VnD(), z1_inputs);
int32_t z2_inputs[] = {0x12340042, 0, -1, 1, INT32_MAX, INT32_MIN};
InsrHelper(&masm, z2.VnS(), z2_inputs);
int16_t z3_inputs[] = {0x122a, 0, 1, -1, INT16_MIN, INT16_MAX};
InsrHelper(&masm, z3.VnH(), z3_inputs);
// The MacroAssembler implements non-destructive operations using movprfx.
__ Sqdecp(z10.VnD(), p0, z0.VnD());
__ Sqdecp(z11.VnD(), p0, z1.VnD());
__ Sqdecp(z12.VnS(), p0, z2.VnS());
__ Sqdecp(z13.VnH(), p0, z3.VnH());
__ Sqincp(z14.VnD(), p0, z0.VnD());
__ Sqincp(z15.VnD(), p0, z1.VnD());
__ Sqincp(z16.VnS(), p0, z2.VnS());
__ Sqincp(z17.VnH(), p0, z3.VnH());
// Also test destructive forms.
__ Mov(z4, z0);
__ Mov(z5, z1);
__ Mov(z6, z2);
__ Mov(z7, z3);
__ Sqdecp(z0.VnD(), p0);
__ Sqdecp(z1.VnD(), p0);
__ Sqdecp(z2.VnS(), p0);
__ Sqdecp(z3.VnH(), p0);
__ Sqincp(z4.VnD(), p0);
__ Sqincp(z5.VnD(), p0);
__ Sqincp(z6.VnS(), p0);
__ Sqincp(z7.VnH(), p0);
END();
if (CAN_RUN()) {
RUN();
// z0_inputs[...] - number of active D lanes (2)
int64_t z0_expected[] = {0x1234567800000040, -2, -1, INT64_MIN};
ASSERT_EQUAL_SVE(z0_expected, z0.VnD());
// z1_inputs[...] - number of active D lanes (2)
int64_t z1_expected[] = {0x12345678ffffff28, -2, -3, 0x7ffffffffffffffd};
ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
// z2_inputs[...] - number of active S lanes (3)
int32_t z2_expected[] = {0x1234003f, -3, -4, -2, 0x7ffffffc, INT32_MIN};
ASSERT_EQUAL_SVE(z2_expected, z2.VnS());
// z3_inputs[...] - number of active H lanes (5)
int16_t z3_expected[] = {0x1225, -5, -4, -6, INT16_MIN, 0x7ffa};
ASSERT_EQUAL_SVE(z3_expected, z3.VnH());
// z0_inputs[...] + number of active D lanes (2)
uint64_t z4_expected[] = {0x1234567800000044, 2, 3, 0x8000000000000002};
ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
// z1_inputs[...] + number of active D lanes (2)
uint64_t z5_expected[] = {0x12345678ffffff2c, 2, 1, INT64_MAX};
ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
// z2_inputs[...] + number of active S lanes (3)
uint32_t z6_expected[] = {0x12340045, 3, 2, 4, INT32_MAX, 0x80000003};
ASSERT_EQUAL_SVE(z6_expected, z6.VnS());
// z3_inputs[...] + number of active H lanes (5)
uint16_t z7_expected[] = {0x122f, 5, 6, 4, 0x8005, INT16_MAX};
ASSERT_EQUAL_SVE(z7_expected, z7.VnH());
// Check that the non-destructive macros produced the same results.
ASSERT_EQUAL_SVE(z0_expected, z10.VnD());
ASSERT_EQUAL_SVE(z1_expected, z11.VnD());
ASSERT_EQUAL_SVE(z2_expected, z12.VnS());
ASSERT_EQUAL_SVE(z3_expected, z13.VnH());
ASSERT_EQUAL_SVE(z4_expected, z14.VnD());
ASSERT_EQUAL_SVE(z5_expected, z15.VnD());
ASSERT_EQUAL_SVE(z6_expected, z16.VnS());
ASSERT_EQUAL_SVE(z7_expected, z17.VnH());
}
}
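// A sketch of the signed saturating arithmetic used by `sqincp`/`sqdecp`
// (illustrative only), shown for the D-sized form; narrower lanes clamp at
// their own signed bounds.
static int64_t SqAddSatModel(int64_t acc, int64_t delta) {
  if ((delta > 0) && (acc > (INT64_MAX - delta))) return INT64_MAX;
  if ((delta < 0) && (acc < (INT64_MIN - delta))) return INT64_MIN;
  return acc + delta;
}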
TEST_SVE(sve_sqinc_sqdec_ptrue_vector) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// With an all-true predicate, these instructions increment or decrement by
// the vector length.
__ Ptrue(p15.VnB());
__ Dup(z0.VnD(), 0);
__ Sqdecp(z0.VnD(), p15);
__ Dup(z1.VnS(), 0);
__ Sqdecp(z1.VnS(), p15);
__ Dup(z2.VnH(), 0);
__ Sqdecp(z2.VnH(), p15);
__ Dup(z3.VnD(), 0);
__ Sqincp(z3.VnD(), p15);
__ Dup(z4.VnS(), 0);
__ Sqincp(z4.VnS(), p15);
__ Dup(z5.VnH(), 0);
__ Sqincp(z5.VnH(), p15);
END();
if (CAN_RUN()) {
RUN();
int d_lane_count = core.GetSVELaneCount(kDRegSize);
int s_lane_count = core.GetSVELaneCount(kSRegSize);
int h_lane_count = core.GetSVELaneCount(kHRegSize);
for (int i = 0; i < d_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE(-d_lane_count, z0.VnD(), i);
ASSERT_EQUAL_SVE_LANE(d_lane_count, z3.VnD(), i);
}
for (int i = 0; i < s_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE(-s_lane_count, z1.VnS(), i);
ASSERT_EQUAL_SVE_LANE(s_lane_count, z4.VnS(), i);
}
for (int i = 0; i < h_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE(-h_lane_count, z2.VnH(), i);
ASSERT_EQUAL_SVE_LANE(h_lane_count, z5.VnH(), i);
}
}
}
TEST_SVE(sve_uqinc_uqdec_p_vector) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// There are {5, 3, 2} active {H, S, D} lanes. B-sized lanes are ignored.
int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
Initialise(&masm, p0.VnB(), p0_inputs);
// Check that saturation behaves correctly.
uint64_t z0_inputs[] = {0x1234567800000042, 0, 1, 0x8000000000000000};
InsrHelper(&masm, z0.VnD(), z0_inputs);
uint64_t z1_inputs[] = {0x12345678ffffff2a, 0, UINT64_MAX, INT64_MAX};
InsrHelper(&masm, z1.VnD(), z1_inputs);
uint32_t z2_inputs[] = {0x12340042, 0, UINT32_MAX, 1, INT32_MAX, 0x80000000};
InsrHelper(&masm, z2.VnS(), z2_inputs);
uint16_t z3_inputs[] = {0x122a, 0, 1, UINT16_MAX, 0x8000, INT16_MAX};
InsrHelper(&masm, z3.VnH(), z3_inputs);
// The MacroAssembler implements non-destructive operations using movprfx.
__ Uqdecp(z10.VnD(), p0, z0.VnD());
__ Uqdecp(z11.VnD(), p0, z1.VnD());
__ Uqdecp(z12.VnS(), p0, z2.VnS());
__ Uqdecp(z13.VnH(), p0, z3.VnH());
__ Uqincp(z14.VnD(), p0, z0.VnD());
__ Uqincp(z15.VnD(), p0, z1.VnD());
__ Uqincp(z16.VnS(), p0, z2.VnS());
__ Uqincp(z17.VnH(), p0, z3.VnH());
// Also test destructive forms.
__ Mov(z4, z0);
__ Mov(z5, z1);
__ Mov(z6, z2);
__ Mov(z7, z3);
__ Uqdecp(z0.VnD(), p0);
__ Uqdecp(z1.VnD(), p0);
__ Uqdecp(z2.VnS(), p0);
__ Uqdecp(z3.VnH(), p0);
__ Uqincp(z4.VnD(), p0);
__ Uqincp(z5.VnD(), p0);
__ Uqincp(z6.VnS(), p0);
__ Uqincp(z7.VnH(), p0);
END();
if (CAN_RUN()) {
RUN();
// z0_inputs[...] - number of active D lanes (2)
uint64_t z0_expected[] = {0x1234567800000040, 0, 0, 0x7ffffffffffffffe};
ASSERT_EQUAL_SVE(z0_expected, z0.VnD());
// z1_inputs[...] - number of active D lanes (2)
uint64_t z1_expected[] = {0x12345678ffffff28,
0,
0xfffffffffffffffd,
0x7ffffffffffffffd};
ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
// z2_inputs[...] - number of active S lanes (3)
uint32_t z2_expected[] =
{0x1234003f, 0, 0xfffffffc, 0, 0x7ffffffc, 0x7ffffffd};
ASSERT_EQUAL_SVE(z2_expected, z2.VnS());
// z3_inputs[...] - number of active H lanes (5)
uint16_t z3_expected[] = {0x1225, 0, 0, 0xfffa, 0x7ffb, 0x7ffa};
ASSERT_EQUAL_SVE(z3_expected, z3.VnH());
// z0_inputs[...] + number of active D lanes (2)
uint64_t z4_expected[] = {0x1234567800000044, 2, 3, 0x8000000000000002};
ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
// z1_inputs[...] + number of active D lanes (2)
uint64_t z5_expected[] = {0x12345678ffffff2c,
2,
UINT64_MAX,
0x8000000000000001};
ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
// z2_inputs[...] + number of active S lanes (3)
uint32_t z6_expected[] =
{0x12340045, 3, UINT32_MAX, 4, 0x80000002, 0x80000003};
ASSERT_EQUAL_SVE(z6_expected, z6.VnS());
// z3_inputs[...] + number of active H lanes (5)
uint16_t z7_expected[] = {0x122f, 5, 6, UINT16_MAX, 0x8005, 0x8004};
ASSERT_EQUAL_SVE(z7_expected, z7.VnH());
// Check that the non-destructive macros produced the same results.
ASSERT_EQUAL_SVE(z0_expected, z10.VnD());
ASSERT_EQUAL_SVE(z1_expected, z11.VnD());
ASSERT_EQUAL_SVE(z2_expected, z12.VnS());
ASSERT_EQUAL_SVE(z3_expected, z13.VnH());
ASSERT_EQUAL_SVE(z4_expected, z14.VnD());
ASSERT_EQUAL_SVE(z5_expected, z15.VnD());
ASSERT_EQUAL_SVE(z6_expected, z16.VnS());
ASSERT_EQUAL_SVE(z7_expected, z17.VnH());
}
}
TEST_SVE(sve_uqinc_uqdec_ptrue_vector) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// With an all-true predicate, these instructions increment or decrement by
// the vector length.
__ Ptrue(p15.VnB());
__ Mov(x0, 0x1234567800000000);
__ Mov(x1, 0x12340000);
__ Mov(x2, 0x1200);
__ Dup(z0.VnD(), x0);
__ Uqdecp(z0.VnD(), p15);
__ Dup(z1.VnS(), x1);
__ Uqdecp(z1.VnS(), p15);
__ Dup(z2.VnH(), x2);
__ Uqdecp(z2.VnH(), p15);
__ Dup(z3.VnD(), x0);
__ Uqincp(z3.VnD(), p15);
__ Dup(z4.VnS(), x1);
__ Uqincp(z4.VnS(), p15);
__ Dup(z5.VnH(), x2);
__ Uqincp(z5.VnH(), p15);
END();
if (CAN_RUN()) {
RUN();
int d_lane_count = core.GetSVELaneCount(kDRegSize);
int s_lane_count = core.GetSVELaneCount(kSRegSize);
int h_lane_count = core.GetSVELaneCount(kHRegSize);
for (int i = 0; i < d_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE(0x1234567800000000 - d_lane_count, z0.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0x1234567800000000 + d_lane_count, z3.VnD(), i);
}
for (int i = 0; i < s_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE(0x12340000 - s_lane_count, z1.VnS(), i);
ASSERT_EQUAL_SVE_LANE(0x12340000 + s_lane_count, z4.VnS(), i);
}
for (int i = 0; i < h_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE(0x1200 - h_lane_count, z2.VnH(), i);
ASSERT_EQUAL_SVE_LANE(0x1200 + h_lane_count, z5.VnH(), i);
}
}
}
TEST_SVE(sve_index) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// Simple cases.
__ Index(z0.VnB(), 0, 1);
__ Index(z1.VnH(), 1, 1);
__ Index(z2.VnS(), 2, 1);
__ Index(z3.VnD(), 3, 1);
// Synthesised immediates.
__ Index(z4.VnB(), 42, -1);
__ Index(z5.VnH(), -1, 42);
__ Index(z6.VnS(), 42, 42);
// Register arguments.
__ Mov(x0, 42);
__ Mov(x1, -3);
__ Index(z10.VnD(), x0, x1);
__ Index(z11.VnB(), w0, w1);
// The register size should correspond to the lane size, but VIXL allows any
// register at least as big as the lane size.
__ Index(z12.VnB(), x0, x1);
__ Index(z13.VnH(), w0, x1);
__ Index(z14.VnS(), x0, w1);
// Integer overflow.
__ Index(z20.VnB(), UINT8_MAX - 2, 2);
__ Index(z21.VnH(), 7, -3);
__ Index(z22.VnS(), INT32_MAX - 2, 1);
__ Index(z23.VnD(), INT64_MIN + 6, -7);
END();
if (CAN_RUN()) {
RUN();
int b_lane_count = core.GetSVELaneCount(kBRegSize);
int h_lane_count = core.GetSVELaneCount(kHRegSize);
int s_lane_count = core.GetSVELaneCount(kSRegSize);
int d_lane_count = core.GetSVELaneCount(kDRegSize);
uint64_t b_mask = GetUintMask(kBRegSize);
uint64_t h_mask = GetUintMask(kHRegSize);
uint64_t s_mask = GetUintMask(kSRegSize);
uint64_t d_mask = GetUintMask(kDRegSize);
// Simple cases.
for (int i = 0; i < b_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE((0 + i) & b_mask, z0.VnB(), i);
}
for (int i = 0; i < h_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE((1 + i) & h_mask, z1.VnH(), i);
}
for (int i = 0; i < s_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE((2 + i) & s_mask, z2.VnS(), i);
}
for (int i = 0; i < d_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE((3 + i) & d_mask, z3.VnD(), i);
}
// Synthesised immediates.
for (int i = 0; i < b_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE((42 - i) & b_mask, z4.VnB(), i);
}
for (int i = 0; i < h_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE((-1 + (42 * i)) & h_mask, z5.VnH(), i);
}
for (int i = 0; i < s_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE((42 + (42 * i)) & s_mask, z6.VnS(), i);
}
// Register arguments.
for (int i = 0; i < d_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & d_mask, z10.VnD(), i);
}
for (int i = 0; i < b_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & b_mask, z11.VnB(), i);
}
for (int i = 0; i < b_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & b_mask, z12.VnB(), i);
}
for (int i = 0; i < h_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & h_mask, z13.VnH(), i);
}
for (int i = 0; i < s_lane_count; i++) {
ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & s_mask, z14.VnS(), i);
}
// Integer overflow.
uint8_t expected_z20[] = {0x05, 0x03, 0x01, 0xff, 0xfd};
ASSERT_EQUAL_SVE(expected_z20, z20.VnB());
uint16_t expected_z21[] = {0xfffb, 0xfffe, 0x0001, 0x0004, 0x0007};
ASSERT_EQUAL_SVE(expected_z21, z21.VnH());
uint32_t expected_z22[] = {0x80000000, 0x7fffffff, 0x7ffffffe, 0x7ffffffd};
ASSERT_EQUAL_SVE(expected_z22, z22.VnS());
uint64_t expected_z23[] = {0x7fffffffffffffff, 0x8000000000000006};
ASSERT_EQUAL_SVE(expected_z23, z23.VnD());
}
}
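// A sketch of the `index` computation checked above (illustrative only):
// lane i holds start + (step * i), truncated to the lane width, so overflow
// simply wraps, as the last four expected arrays show.
static uint64_t IndexLaneModel(int64_t start, int64_t step, int i,
                               uint64_t lane_mask) {
  // Compute in unsigned arithmetic so that wrap-around is well defined.
  return (static_cast<uint64_t>(start) + (static_cast<uint64_t>(step) * i)) &
         lane_mask;
}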
TEST(sve_int_compare_count_and_limit_scalars) {
SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
__ Mov(w20, 0xfffffffd);
__ Mov(w21, 0xffffffff);
__ Whilele(p0.VnB(), w20, w21);
__ Mrs(x0, NZCV);
__ Whilele(p1.VnH(), w20, w21);
__ Mrs(x1, NZCV);
__ Mov(w20, 0xffffffff);
__ Mov(w21, 0x00000000);
__ Whilelt(p2.VnS(), w20, w21);
__ Mrs(x2, NZCV);
__ Whilelt(p3.VnD(), w20, w21);
__ Mrs(x3, NZCV);
__ Mov(w20, 0xfffffffd);
__ Mov(w21, 0xffffffff);
__ Whilels(p4.VnB(), w20, w21);
__ Mrs(x4, NZCV);
__ Whilels(p5.VnH(), w20, w21);
__ Mrs(x5, NZCV);
__ Mov(w20, 0xffffffff);
__ Mov(w21, 0x00000000);
__ Whilelo(p6.VnS(), w20, w21);
__ Mrs(x6, NZCV);
__ Whilelo(p7.VnD(), w20, w21);
__ Mrs(x7, NZCV);
__ Mov(x20, 0xfffffffffffffffd);
__ Mov(x21, 0xffffffffffffffff);
__ Whilele(p8.VnB(), x20, x21);
__ Mrs(x8, NZCV);
__ Whilele(p9.VnH(), x20, x21);
__ Mrs(x9, NZCV);
__ Mov(x20, 0xffffffffffffffff);
__ Mov(x21, 0x0000000000000000);
__ Whilelt(p10.VnS(), x20, x21);
__ Mrs(x10, NZCV);
__ Whilelt(p11.VnD(), x20, x21);
__ Mrs(x11, NZCV);
__ Mov(x20, 0xfffffffffffffffd);
__ Mov(x21, 0xffffffffffffffff);
__ Whilels(p12.VnB(), x20, x21);
__ Mrs(x12, NZCV);
__ Whilels(p13.VnH(), x20, x21);
__ Mrs(x13, NZCV);
__ Mov(x20, 0xffffffffffffffff);
__ Mov(x21, 0x0000000000000000);
__ Whilelo(p14.VnS(), x20, x21);
__ Mrs(x14, NZCV);
__ Whilelo(p15.VnD(), x20, x21);
__ Mrs(x15, NZCV);
END();
if (CAN_RUN()) {
RUN();
// 0b...00000000'00000111
int p0_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
ASSERT_EQUAL_SVE(p0_expected, p0.VnB());
// 0b...00000000'00010101
int p1_expected[] = {0, 0, 0, 0, 0, 1, 1, 1};
ASSERT_EQUAL_SVE(p1_expected, p1.VnH());
int p2_expected[] = {0x0, 0x0, 0x0, 0x1};
ASSERT_EQUAL_SVE(p2_expected, p2.VnS());
int p3_expected[] = {0x00, 0x01};
ASSERT_EQUAL_SVE(p3_expected, p3.VnD());
// 0b...11111111'11111111
int p4_expected[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
// 0b...01010101'01010101
int p5_expected[] = {1, 1, 1, 1, 1, 1, 1, 1};
ASSERT_EQUAL_SVE(p5_expected, p5.VnH());
int p6_expected[] = {0x0, 0x0, 0x0, 0x0};
ASSERT_EQUAL_SVE(p6_expected, p6.VnS());
int p7_expected[] = {0x00, 0x00};
ASSERT_EQUAL_SVE(p7_expected, p7.VnD());
// 0b...00000000'00000111
int p8_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
// 0b...00000000'00010101
int p9_expected[] = {0, 0, 0, 0, 0, 1, 1, 1};
ASSERT_EQUAL_SVE(p9_expected, p9.VnH());
int p10_expected[] = {0x0, 0x0, 0x0, 0x1};
ASSERT_EQUAL_SVE(p10_expected, p10.VnS());
int p11_expected[] = {0x00, 0x01};
ASSERT_EQUAL_SVE(p11_expected, p11.VnD());
// 0b...11111111'11111111
int p12_expected[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
ASSERT_EQUAL_SVE(p12_expected, p12.VnB());
// 0b...01010101'01010101
int p13_expected[] = {1, 1, 1, 1, 1, 1, 1, 1};
ASSERT_EQUAL_SVE(p13_expected, p13.VnH());
int p14_expected[] = {0x0, 0x0, 0x0, 0x0};
ASSERT_EQUAL_SVE(p14_expected, p14.VnS());
int p15_expected[] = {0x00, 0x00};
ASSERT_EQUAL_SVE(p15_expected, p15.VnD());
ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w0);
ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w1);
ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w2);
ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w3);
ASSERT_EQUAL_32(SVEFirstFlag, w4);
ASSERT_EQUAL_32(SVEFirstFlag, w5);
ASSERT_EQUAL_32(SVENoneFlag | SVENotLastFlag, w6);
ASSERT_EQUAL_32(SVENoneFlag | SVENotLastFlag, w7);
ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w8);
ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w9);
ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w10);
ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w11);
ASSERT_EQUAL_32(SVEFirstFlag, w12);
ASSERT_EQUAL_32(SVEFirstFlag, w13);
ASSERT_EQUAL_32(SVENoneFlag | SVENotLastFlag, w14);
ASSERT_EQUAL_32(SVENoneFlag | SVENotLastFlag, w15);
}
}
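// A summary of the while-family semantics checked above (illustrative): lane
// i of the result is active iff the start value, incremented i times, still
// compares true against the limit; for `whilele`, lane i is active iff
// (start + i) <= limit, with the increment wrapping at the register width.
// The regression test below relies on that wrapping behaviour.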
TEST(sve_int_compare_count_and_limit_scalars_regression_test) {
SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
__ Mov(w0, 0x7ffffffd);
__ Mov(w1, 0x7fffffff);
__ Whilele(p0.VnB(), w0, w1);
END();
if (CAN_RUN()) {
RUN();
int p0_expected[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
ASSERT_EQUAL_SVE(p0_expected, p0.VnB());
}
}
TEST(sve_int_compare_vectors_signed_imm) {
SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int z13_inputs[] = {0, 1, -1, -15, 126, -127, -126, -15};
int mask_inputs1[] = {1, 1, 1, 0, 1, 1, 1, 1};
InsrHelper(&masm, z13.VnB(), z13_inputs);
Initialise(&masm, p0.VnB(), mask_inputs1);
__ Cmpeq(p2.VnB(), p0.Zeroing(), z13.VnB(), -15);
__ Mrs(x2, NZCV);
__ Cmpeq(p3.VnB(), p0.Zeroing(), z13.VnB(), -127);
int z14_inputs[] = {0, 1, -1, -32767, -32766, 32767, 32766, 0};
int mask_inputs2[] = {1, 1, 1, 0, 1, 1, 1, 1};
InsrHelper(&masm, z14.VnH(), z14_inputs);
Initialise(&masm, p0.VnH(), mask_inputs2);
__ Cmpge(p4.VnH(), p0.Zeroing(), z14.VnH(), -1);
__ Mrs(x4, NZCV);
__ Cmpge(p5.VnH(), p0.Zeroing(), z14.VnH(), -32767);
int z15_inputs[] = {0, 1, -1, INT_MIN};
int mask_inputs3[] = {0, 1, 1, 1};
InsrHelper(&masm, z15.VnS(), z15_inputs);
Initialise(&masm, p0.VnS(), mask_inputs3);
__ Cmpgt(p6.VnS(), p0.Zeroing(), z15.VnS(), 0);
__ Mrs(x6, NZCV);
__ Cmpgt(p7.VnS(), p0.Zeroing(), z15.VnS(), INT_MIN + 1);
__ Cmplt(p8.VnS(), p0.Zeroing(), z15.VnS(), 0);
__ Mrs(x8, NZCV);
__ Cmplt(p9.VnS(), p0.Zeroing(), z15.VnS(), INT_MIN + 1);
int64_t z16_inputs[] = {0, -1};
int mask_inputs4[] = {1, 1};
InsrHelper(&masm, z16.VnD(), z16_inputs);
Initialise(&masm, p0.VnD(), mask_inputs4);
__ Cmple(p10.VnD(), p0.Zeroing(), z16.VnD(), -1);
__ Mrs(x10, NZCV);
__ Cmple(p11.VnD(), p0.Zeroing(), z16.VnD(), LLONG_MIN);
__ Cmpne(p12.VnD(), p0.Zeroing(), z16.VnD(), -1);
__ Mrs(x12, NZCV);
__ Cmpne(p13.VnD(), p0.Zeroing(), z16.VnD(), LLONG_MAX);
END();
if (CAN_RUN()) {
RUN();
int p2_expected[] = {0, 0, 0, 0, 0, 0, 0, 1};
ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
int p3_expected[] = {0, 0, 0, 0, 0, 1, 0, 0};
ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
int p4_expected[] = {0x1, 0x1, 0x1, 0x0, 0x0, 0x1, 0x1, 0x1};
ASSERT_EQUAL_SVE(p4_expected, p4.VnH());
int p5_expected[] = {0x1, 0x1, 0x1, 0x0, 0x1, 0x1, 0x1, 0x1};
ASSERT_EQUAL_SVE(p5_expected, p5.VnH());
int p6_expected[] = {0x0, 0x1, 0x0, 0x0};
ASSERT_EQUAL_SVE(p6_expected, p6.VnS());
int p7_expected[] = {0x0, 0x1, 0x1, 0x0};
ASSERT_EQUAL_SVE(p7_expected, p7.VnS());
int p8_expected[] = {0x0, 0x0, 0x1, 0x1};
ASSERT_EQUAL_SVE(p8_expected, p8.VnS());
int p9_expected[] = {0x0, 0x0, 0x0, 0x1};
ASSERT_EQUAL_SVE(p9_expected, p9.VnS());
int p10_expected[] = {0x00, 0x01};
ASSERT_EQUAL_SVE(p10_expected, p10.VnD());
int p11_expected[] = {0x00, 0x00};
ASSERT_EQUAL_SVE(p11_expected, p11.VnD());
int p12_expected[] = {0x01, 0x00};
ASSERT_EQUAL_SVE(p12_expected, p12.VnD());
int p13_expected[] = {0x01, 0x01};
ASSERT_EQUAL_SVE(p13_expected, p13.VnD());
ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w2);
ASSERT_EQUAL_32(SVEFirstFlag, w4);
ASSERT_EQUAL_32(NoFlag, w6);
ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w8);
ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w10);
ASSERT_EQUAL_32(NoFlag, w12);
}
}
TEST(sve_int_compare_vectors_unsigned_imm) {
SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint32_t src1_inputs[] = {0xf7, 0x0f, 0x8f, 0x1f, 0x83, 0x12, 0x00, 0xf1};
int mask_inputs1[] = {1, 1, 1, 0, 1, 1, 0, 1};
InsrHelper(&masm, z13.VnB(), src1_inputs);
Initialise(&masm, p0.VnB(), mask_inputs1);
__ Cmphi(p2.VnB(), p0.Zeroing(), z13.VnB(), 0x0f);
__ Mrs(x2, NZCV);
__ Cmphi(p3.VnB(), p0.Zeroing(), z13.VnB(), 0xf0);
uint32_t src2_inputs[] = {0xffff, 0x8000, 0x1fff, 0x0000, 0x1234};
int mask_inputs2[] = {1, 1, 1, 1, 0};
InsrHelper(&masm, z13.VnH(), src2_inputs);
Initialise(&masm, p0.VnH(), mask_inputs2);
__ Cmphs(p4.VnH(), p0.Zeroing(), z13.VnH(), 0x1f);
__ Mrs(x4, NZCV);
__ Cmphs(p5.VnH(), p0.Zeroing(), z13.VnH(), 0x1fff);
uint32_t src3_inputs[] = {0xffffffff, 0xfedcba98, 0x0000ffff, 0x00000000};
int mask_inputs3[] = {1, 1, 1, 1};
InsrHelper(&masm, z13.VnS(), src3_inputs);
Initialise(&masm, p0.VnS(), mask_inputs3);
__ Cmplo(p6.VnS(), p0.Zeroing(), z13.VnS(), 0x3f);
__ Mrs(x6, NZCV);
__ Cmplo(p7.VnS(), p0.Zeroing(), z13.VnS(), 0x3f3f3f3f);
uint64_t src4_inputs[] = {0xffffffffffffffff, 0x0000000000000000};
int mask_inputs4[] = {1, 1};
InsrHelper(&masm, z13.VnD(), src4_inputs);
Initialise(&masm, p0.VnD(), mask_inputs4);
__ Cmpls(p8.VnD(), p0.Zeroing(), z13.VnD(), 0x2f);
__ Mrs(x8, NZCV);
__ Cmpls(p9.VnD(), p0.Zeroing(), z13.VnD(), 0x800000000000000);
END();
if (CAN_RUN()) {
RUN();
int p2_expected[] = {1, 0, 1, 0, 1, 1, 0, 1};
ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
int p3_expected[] = {1, 0, 0, 0, 0, 0, 0, 1};
ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
int p4_expected[] = {0x1, 0x1, 0x1, 0x0, 0x0};
ASSERT_EQUAL_SVE(p4_expected, p4.VnH());
int p5_expected[] = {0x1, 0x1, 0x1, 0x0, 0x0};
ASSERT_EQUAL_SVE(p5_expected, p5.VnH());
int p6_expected[] = {0x0, 0x0, 0x0, 0x1};
ASSERT_EQUAL_SVE(p6_expected, p6.VnS());
int p7_expected[] = {0x0, 0x0, 0x1, 0x1};
ASSERT_EQUAL_SVE(p7_expected, p7.VnS());
int p8_expected[] = {0x00, 0x01};
ASSERT_EQUAL_SVE(p8_expected, p8.VnD());
int p9_expected[] = {0x00, 0x01};
ASSERT_EQUAL_SVE(p9_expected, p9.VnD());
ASSERT_EQUAL_32(SVEFirstFlag, w2);
ASSERT_EQUAL_32(NoFlag, w4);
ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w6);
ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w8);
}
}
TEST(sve_int_compare_conditionally_terminate_scalars) {
SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
__ Mov(x0, 0xfedcba9887654321);
__ Mov(x1, 0x1000100010001000);
// Initialise Z and C. These are preserved by cterm*, and the V flag is set to
// !C if the condition does not hold.
__ Mov(x10, NoFlag);
__ Msr(NZCV, x10);
__ Ctermeq(w0, w0);
__ Mrs(x2, NZCV);
__ Ctermeq(x0, x1);
__ Mrs(x3, NZCV);
__ Ctermne(x0, x0);
__ Mrs(x4, NZCV);
__ Ctermne(w0, w1);
__ Mrs(x5, NZCV);
// As above, but with all flags initially set.
__ Mov(x10, NZCVFlag);
__ Msr(NZCV, x10);
__ Ctermeq(w0, w0);
__ Mrs(x6, NZCV);
__ Ctermeq(x0, x1);
__ Mrs(x7, NZCV);
__ Ctermne(x0, x0);
__ Mrs(x8, NZCV);
__ Ctermne(w0, w1);
__ Mrs(x9, NZCV);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_32(SVEFirstFlag, w2);
ASSERT_EQUAL_32(VFlag, w3);
ASSERT_EQUAL_32(VFlag, w4);
ASSERT_EQUAL_32(SVEFirstFlag, w5);
ASSERT_EQUAL_32(SVEFirstFlag | ZCFlag, w6);
ASSERT_EQUAL_32(ZCFlag, w7);
ASSERT_EQUAL_32(ZCFlag, w8);
ASSERT_EQUAL_32(SVEFirstFlag | ZCFlag, w9);
}
}
// Work out what the architectural `PredTest` pseudocode should produce for the
// given result and governing predicate.
template <typename Tg, typename Td, int N>
static StatusFlags GetPredTestFlags(const Td (&pd)[N],
const Tg (&pg)[N],
int vl) {
int first = -1;
int last = -1;
bool any_active = false;
// Only consider potentially-active lanes.
int start = (N > vl) ? (N - vl) : 0;
for (int i = start; i < N; i++) {
if ((pg[i] & 1) == 1) {
// Look for the first and last active lanes.
// Note that these arrays list the highest-numbered lane first, so the
// architecturally-'first' lane is the one with the highest array index.
if (last < 0) last = i;
first = i;
// Look for any active lanes that are also active in pd.
if ((pd[i] & 1) == 1) any_active = true;
}
}
uint32_t flags = 0;
if ((first >= 0) && ((pd[first] & 1) == 1)) flags |= SVEFirstFlag;
if (!any_active) flags |= SVENoneFlag;
if ((last < 0) || ((pd[last] & 1) == 0)) flags |= SVENotLastFlag;
return static_cast<StatusFlags>(flags);
}
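// For example (writing lane 0 last, as the test arrays below do), with
// pg = {1, 0, 1} and pd = {0, 0, 1}: the first active lane of pg (lane 0)
// is active in pd, so SVEFirstFlag is set; at least one active lane of pg
// is active in pd, so SVENoneFlag is clear; the last active lane of pg
// (lane 2) is inactive in pd, so SVENotLastFlag is set.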
typedef void (MacroAssembler::*PfirstPnextFn)(const PRegisterWithLaneSize& pd,
const PRegister& pg,
const PRegisterWithLaneSize& pn);
template <typename Tg, typename Tn, typename Td>
static void PfirstPnextHelper(Test* config,
PfirstPnextFn macro,
unsigned lane_size_in_bits,
const Tg& pg_inputs,
const Tn& pn_inputs,
const Td& pd_expected) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
PRegister pg = p15;
PRegister pn = p14;
Initialise(&masm, pg.WithLaneSize(lane_size_in_bits), pg_inputs);
Initialise(&masm, pn.WithLaneSize(lane_size_in_bits), pn_inputs);
// Initialise NZCV to an impossible value, to check that we actually write it.
__ Mov(x10, NZCVFlag);
// If pd.Is(pn), the MacroAssembler simply passes the arguments directly to
// the Assembler.
__ Msr(NZCV, x10);
__ Mov(p0, pn);
(masm.*macro)(p0.WithLaneSize(lane_size_in_bits),
pg,
p0.WithLaneSize(lane_size_in_bits));
__ Mrs(x0, NZCV);
// The MacroAssembler supports non-destructive use.
__ Msr(NZCV, x10);
(masm.*macro)(p1.WithLaneSize(lane_size_in_bits),
pg,
pn.WithLaneSize(lane_size_in_bits));
__ Mrs(x1, NZCV);
// If pd.Aliases(pg) the macro requires a scratch register.
{
UseScratchRegisterScope temps(&masm);
temps.Include(p13);
__ Msr(NZCV, x10);
__ Mov(p2, p15);
(masm.*macro)(p2.WithLaneSize(lane_size_in_bits),
p2,
pn.WithLaneSize(lane_size_in_bits));
__ Mrs(x2, NZCV);
}
END();
if (CAN_RUN()) {
RUN();
// Check that the inputs weren't modified.
ASSERT_EQUAL_SVE(pn_inputs, pn.WithLaneSize(lane_size_in_bits));
ASSERT_EQUAL_SVE(pg_inputs, pg.WithLaneSize(lane_size_in_bits));
// Check the primary operation.
ASSERT_EQUAL_SVE(pd_expected, p0.WithLaneSize(lane_size_in_bits));
ASSERT_EQUAL_SVE(pd_expected, p1.WithLaneSize(lane_size_in_bits));
ASSERT_EQUAL_SVE(pd_expected, p2.WithLaneSize(lane_size_in_bits));
// Check that the flags were properly set.
StatusFlags nzcv_expected =
GetPredTestFlags(pd_expected,
pg_inputs,
core.GetSVELaneCount(kBRegSize));
ASSERT_EQUAL_64(nzcv_expected, x0);
ASSERT_EQUAL_64(nzcv_expected, x1);
ASSERT_EQUAL_64(nzcv_expected, x2);
}
}
template <typename Tg, typename Tn, typename Td>
static void PfirstHelper(Test* config,
const Tg& pg_inputs,
const Tn& pn_inputs,
const Td& pd_expected) {
PfirstPnextHelper(config,
&MacroAssembler::Pfirst,
kBRegSize, // pfirst only accepts B-sized lanes.
pg_inputs,
pn_inputs,
pd_expected);
}
template <typename Tg, typename Tn, typename Td>
static void PnextHelper(Test* config,
unsigned lane_size_in_bits,
const Tg& pg_inputs,
const Tn& pn_inputs,
const Td& pd_expected) {
PfirstPnextHelper(config,
&MacroAssembler::Pnext,
lane_size_in_bits,
pg_inputs,
pn_inputs,
pd_expected);
}
TEST_SVE(sve_pfirst) {
// Provide more lanes than kPRegMinSize (to check propagation if we have a
// large VL), but few enough to make the test easy to read.
int in0[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int in1[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
int in2[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
int in3[] = {0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1};
int in4[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
VIXL_ASSERT(ArrayLength(in0) > kPRegMinSize);
// Pfirst finds the first active lane in pg, and activates the corresponding
// lane in pn (if it isn't already active).
// The first active lane in in1 is here. |
// v
int exp10[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
int exp12[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0};
int exp13[] = {0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1};
int exp14[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
PfirstHelper(config, in1, in0, exp10);
PfirstHelper(config, in1, in2, exp12);
PfirstHelper(config, in1, in3, exp13);
PfirstHelper(config, in1, in4, exp14);
// The first active lane in in2 is here. |
// v
int exp20[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0};
int exp21[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0};
int exp23[] = {0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1};
int exp24[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0};
PfirstHelper(config, in2, in0, exp20);
PfirstHelper(config, in2, in1, exp21);
PfirstHelper(config, in2, in3, exp23);
PfirstHelper(config, in2, in4, exp24);
// The first active lane in in3 is here. |
// v
int exp30[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
int exp31[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1};
int exp32[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1};
int exp34[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
PfirstHelper(config, in3, in0, exp30);
PfirstHelper(config, in3, in1, exp31);
PfirstHelper(config, in3, in2, exp32);
PfirstHelper(config, in3, in4, exp34);
// | The first active lane in in4 is here.
// v
int exp40[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp41[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
int exp42[] = {1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
int exp43[] = {1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1};
PfirstHelper(config, in4, in0, exp40);
PfirstHelper(config, in4, in1, exp41);
PfirstHelper(config, in4, in2, exp42);
PfirstHelper(config, in4, in3, exp43);
// If pg is all inactive, the input is passed through unchanged.
PfirstHelper(config, in0, in0, in0);
PfirstHelper(config, in0, in1, in1);
PfirstHelper(config, in0, in2, in2);
PfirstHelper(config, in0, in3, in3);
// If the values of pg and pn match, the value is passed through unchanged.
PfirstHelper(config, in0, in0, in0);
PfirstHelper(config, in1, in1, in1);
PfirstHelper(config, in2, in2, in2);
PfirstHelper(config, in3, in3, in3);
}
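// A minimal model of the `pfirst` dataflow (illustrative only). It operates
// on abstract lane indices, with lane 0 first; note that the test arrays
// above list the highest-numbered lane first.
static void PfirstModel(const int* pg, const int* pn, int* pd, int lanes) {
  for (int i = 0; i < lanes; i++) pd[i] = pn[i] & 1;
  for (int i = 0; i < lanes; i++) {
    if ((pg[i] & 1) == 1) {
      pd[i] = 1;  // Activate the first active lane of pg...
      break;      // ...and pass every other lane through from pn.
    }
  }
}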
TEST_SVE(sve_pfirst_alias) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// Check that the Simulator behaves correctly when all arguments are aliased.
int in_b[] = {0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0};
int in_h[] = {0, 0, 0, 0, 1, 1, 0, 0};
int in_s[] = {0, 1, 1, 0};
int in_d[] = {1, 1};
Initialise(&masm, p0.VnB(), in_b);
Initialise(&masm, p1.VnH(), in_h);
Initialise(&masm, p2.VnS(), in_s);
Initialise(&masm, p3.VnD(), in_d);
// Initialise NZCV to an impossible value, to check that we actually write it.
__ Mov(x10, NZCVFlag);
__ Msr(NZCV, x10);
__ Pfirst(p0.VnB(), p0, p0.VnB());
__ Mrs(x0, NZCV);
__ Msr(NZCV, x10);
__ Pfirst(p1.VnB(), p1, p1.VnB());
__ Mrs(x1, NZCV);
__ Msr(NZCV, x10);
__ Pfirst(p2.VnB(), p2, p2.VnB());
__ Mrs(x2, NZCV);
__ Msr(NZCV, x10);
__ Pfirst(p3.VnB(), p3, p3.VnB());
__ Mrs(x3, NZCV);
END();
if (CAN_RUN()) {
RUN();
// The first lane from pg is already active in pdn, so the P register should
// be unchanged.
ASSERT_EQUAL_SVE(in_b, p0.VnB());
ASSERT_EQUAL_SVE(in_h, p1.VnH());
ASSERT_EQUAL_SVE(in_s, p2.VnS());
ASSERT_EQUAL_SVE(in_d, p3.VnD());
ASSERT_EQUAL_64(SVEFirstFlag, x0);
ASSERT_EQUAL_64(SVEFirstFlag, x1);
ASSERT_EQUAL_64(SVEFirstFlag, x2);
ASSERT_EQUAL_64(SVEFirstFlag, x3);
}
}
TEST_SVE(sve_pnext_b) {
// TODO: Once we have the infrastructure, provide more lanes than kPRegMinSize
// (to check propagation if we have a large VL), but few enough to make the
// test easy to read.
// For now, we just use kPRegMinSize so that the test works anywhere.
int in0[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int in1[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
int in2[] = {0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
int in3[] = {0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1};
int in4[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
// Pnext activates the next element that is true in pg, after the last-active
// element in pn. If all pn elements are false (as in in0), it starts looking
// at element 0.
// There are no active lanes in in0, so the result is simply the first active
// lane from pg.
int exp00[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp10[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
int exp20[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0};
int exp30[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
int exp40[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
// The last active lane in in1 is here. |
// v
int exp01[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp11[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp21[] = {0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp31[] = {0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp41[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
// | The last active lane in in2 is here.
// v
int exp02[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp12[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp22[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp32[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp42[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
// | The last active lane in in3 is here.
// v
int exp03[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp13[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp23[] = {0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp33[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp43[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
// | The last active lane in in4 is here.
// v
int exp04[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp14[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp24[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp34[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp44[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
PnextHelper(config, kBRegSize, in0, in0, exp00);
PnextHelper(config, kBRegSize, in1, in0, exp10);
PnextHelper(config, kBRegSize, in2, in0, exp20);
PnextHelper(config, kBRegSize, in3, in0, exp30);
PnextHelper(config, kBRegSize, in4, in0, exp40);
PnextHelper(config, kBRegSize, in0, in1, exp01);
PnextHelper(config, kBRegSize, in1, in1, exp11);
PnextHelper(config, kBRegSize, in2, in1, exp21);
PnextHelper(config, kBRegSize, in3, in1, exp31);
PnextHelper(config, kBRegSize, in4, in1, exp41);
PnextHelper(config, kBRegSize, in0, in2, exp02);
PnextHelper(config, kBRegSize, in1, in2, exp12);
PnextHelper(config, kBRegSize, in2, in2, exp22);
PnextHelper(config, kBRegSize, in3, in2, exp32);
PnextHelper(config, kBRegSize, in4, in2, exp42);
PnextHelper(config, kBRegSize, in0, in3, exp03);
PnextHelper(config, kBRegSize, in1, in3, exp13);
PnextHelper(config, kBRegSize, in2, in3, exp23);
PnextHelper(config, kBRegSize, in3, in3, exp33);
PnextHelper(config, kBRegSize, in4, in3, exp43);
PnextHelper(config, kBRegSize, in0, in4, exp04);
PnextHelper(config, kBRegSize, in1, in4, exp14);
PnextHelper(config, kBRegSize, in2, in4, exp24);
PnextHelper(config, kBRegSize, in3, in4, exp34);
PnextHelper(config, kBRegSize, in4, in4, exp44);
}
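// A minimal model of the `pnext` dataflow (illustrative only), again over
// abstract lane indices with lane 0 first (the test arrays list the highest
// lane first).
static void PnextModel(const int* pg, const int* pn, int* pd, int lanes) {
  int last_active_in_pn = -1;
  for (int i = 0; i < lanes; i++) {
    if ((pn[i] & 1) == 1) last_active_in_pn = i;
  }
  for (int i = 0; i < lanes; i++) pd[i] = 0;
  for (int i = last_active_in_pn + 1; i < lanes; i++) {
    if ((pg[i] & 1) == 1) {
      pd[i] = 1;  // Only the next active lane of pg becomes active.
      break;
    }
  }
}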
TEST_SVE(sve_pnext_h) {
// TODO: Once we have the infrastructure, provide more lanes than kPRegMinSize
// (to check propagation if we have a large VL), but few enough to make the
// test easy to read.
// For now, we just use kPRegMinSize so that the test works anywhere.
int in0[] = {0, 0, 0, 0, 0, 0, 0, 0};
int in1[] = {0, 0, 0, 1, 0, 2, 1, 0};
int in2[] = {0, 1, 2, 0, 2, 0, 2, 0};
int in3[] = {0, 0, 0, 3, 0, 0, 0, 3};
int in4[] = {3, 0, 0, 0, 0, 0, 0, 0};
// Pnext activates the next element that is true in pg, after the last-active
// element in pn. If all pn elements are false (as in in0), it starts looking
// at element 0.
//
// As for other SVE instructions, elements are only considered to be active if
// the _first_ bit in each field is one. Other bits are ignored.
// There are no active lanes in in0, so the result is simply the first active
// lane from pg.
int exp00[] = {0, 0, 0, 0, 0, 0, 0, 0};
int exp10[] = {0, 0, 0, 0, 0, 0, 1, 0};
int exp20[] = {0, 1, 0, 0, 0, 0, 0, 0};
int exp30[] = {0, 0, 0, 0, 0, 0, 0, 1};
int exp40[] = {1, 0, 0, 0, 0, 0, 0, 0};
// | The last active lane in in1 is here.
// v
int exp01[] = {0, 0, 0, 0, 0, 0, 0, 0};
int exp11[] = {0, 0, 0, 0, 0, 0, 0, 0};
int exp21[] = {0, 1, 0, 0, 0, 0, 0, 0};
int exp31[] = {0, 0, 0, 0, 0, 0, 0, 0};
int exp41[] = {1, 0, 0, 0, 0, 0, 0, 0};
// | The last active lane in in2 is here.
// v
int exp02[] = {0, 0, 0, 0, 0, 0, 0, 0};
int exp12[] = {0, 0, 0, 0, 0, 0, 0, 0};
int exp22[] = {0, 0, 0, 0, 0, 0, 0, 0};
int exp32[] = {0, 0, 0, 0, 0, 0, 0, 0};
int exp42[] = {1, 0, 0, 0, 0, 0, 0, 0};
// | The last active lane in in3 is here.
// v
int exp03[] = {0, 0, 0, 0, 0, 0, 0, 0};
int exp13[] = {0, 0, 0, 0, 0, 0, 0, 0};
int exp23[] = {0, 1, 0, 0, 0, 0, 0, 0};
int exp33[] = {0, 0, 0, 0, 0, 0, 0, 0};
int exp43[] = {1, 0, 0, 0, 0, 0, 0, 0};
// | The last active lane in in4 is here.
// v
int exp04[] = {0, 0, 0, 0, 0, 0, 0, 0};
int exp14[] = {0, 0, 0, 0, 0, 0, 0, 0};
int exp24[] = {0, 0, 0, 0, 0, 0, 0, 0};
int exp34[] = {0, 0, 0, 0, 0, 0, 0, 0};
int exp44[] = {0, 0, 0, 0, 0, 0, 0, 0};
PnextHelper(config, kHRegSize, in0, in0, exp00);
PnextHelper(config, kHRegSize, in1, in0, exp10);
PnextHelper(config, kHRegSize, in2, in0, exp20);
PnextHelper(config, kHRegSize, in3, in0, exp30);
PnextHelper(config, kHRegSize, in4, in0, exp40);
PnextHelper(config, kHRegSize, in0, in1, exp01);
PnextHelper(config, kHRegSize, in1, in1, exp11);
PnextHelper(config, kHRegSize, in2, in1, exp21);
PnextHelper(config, kHRegSize, in3, in1, exp31);
PnextHelper(config, kHRegSize, in4, in1, exp41);
PnextHelper(config, kHRegSize, in0, in2, exp02);
PnextHelper(config, kHRegSize, in1, in2, exp12);
PnextHelper(config, kHRegSize, in2, in2, exp22);
PnextHelper(config, kHRegSize, in3, in2, exp32);
PnextHelper(config, kHRegSize, in4, in2, exp42);
PnextHelper(config, kHRegSize, in0, in3, exp03);
PnextHelper(config, kHRegSize, in1, in3, exp13);
PnextHelper(config, kHRegSize, in2, in3, exp23);
PnextHelper(config, kHRegSize, in3, in3, exp33);
PnextHelper(config, kHRegSize, in4, in3, exp43);
PnextHelper(config, kHRegSize, in0, in4, exp04);
PnextHelper(config, kHRegSize, in1, in4, exp14);
PnextHelper(config, kHRegSize, in2, in4, exp24);
PnextHelper(config, kHRegSize, in3, in4, exp34);
PnextHelper(config, kHRegSize, in4, in4, exp44);
}
TEST_SVE(sve_pnext_s) {
// TODO: Once we have the infrastructure, provide more lanes than kPRegMinSize
// (to check propagation if we have a large VL), but few enough to make the
// test easy to read.
// For now, we just use kPRegMinSize so that the test works anywhere.
int in0[] = {0xe, 0xc, 0x8, 0x0};
int in1[] = {0x0, 0x2, 0x0, 0x1};
int in2[] = {0x0, 0x1, 0xf, 0x0};
int in3[] = {0xf, 0x0, 0x0, 0x0};
// Pnext activates the next element that is true in pg, after the last-active
// element in pn. If all pn elements are false (as in in0), it starts looking
// at element 0.
//
// As for other SVE instructions, elements are only considered to be active if
// the _first_ bit in each field is one. Other bits are ignored.
// There are no active lanes in in0, so the result is simply the first active
// lane from pg.
int exp00[] = {0, 0, 0, 0};
int exp10[] = {0, 0, 0, 1};
int exp20[] = {0, 0, 1, 0};
int exp30[] = {1, 0, 0, 0};
// | The last active lane in in1 is here.
// v
int exp01[] = {0, 0, 0, 0};
int exp11[] = {0, 0, 0, 0};
int exp21[] = {0, 0, 1, 0};
int exp31[] = {1, 0, 0, 0};
// | The last active lane in in2 is here.
// v
int exp02[] = {0, 0, 0, 0};
int exp12[] = {0, 0, 0, 0};
int exp22[] = {0, 0, 0, 0};
int exp32[] = {1, 0, 0, 0};
// | The last active lane in in3 is here.
// v
int exp03[] = {0, 0, 0, 0};
int exp13[] = {0, 0, 0, 0};
int exp23[] = {0, 0, 0, 0};
int exp33[] = {0, 0, 0, 0};
PnextHelper(config, kSRegSize, in0, in0, exp00);
PnextHelper(config, kSRegSize, in1, in0, exp10);
PnextHelper(config, kSRegSize, in2, in0, exp20);
PnextHelper(config, kSRegSize, in3, in0, exp30);
PnextHelper(config, kSRegSize, in0, in1, exp01);
PnextHelper(config, kSRegSize, in1, in1, exp11);
PnextHelper(config, kSRegSize, in2, in1, exp21);
PnextHelper(config, kSRegSize, in3, in1, exp31);
PnextHelper(config, kSRegSize, in0, in2, exp02);
PnextHelper(config, kSRegSize, in1, in2, exp12);
PnextHelper(config, kSRegSize, in2, in2, exp22);
PnextHelper(config, kSRegSize, in3, in2, exp32);
PnextHelper(config, kSRegSize, in0, in3, exp03);
PnextHelper(config, kSRegSize, in1, in3, exp13);
PnextHelper(config, kSRegSize, in2, in3, exp23);
PnextHelper(config, kSRegSize, in3, in3, exp33);
}
TEST_SVE(sve_pnext_d) {
// TODO: Once we have the infrastructure, provide more lanes than kPRegMinSize
// (to check propagation if we have a large VL), but few enough to make the
// test easy to read.
// For now, we just use kPRegMinSize so that the test works anywhere.
int in0[] = {0xfe, 0xf0};
int in1[] = {0x00, 0x55};
int in2[] = {0x33, 0xff};
// Pnext activates the next element that is true in pg, after the last-active
// element in pn. If all pn elements are false (as in in0), it starts looking
// at element 0.
//
// As for other SVE instructions, elements are only considered to be active if
// the _first_ bit in each field is one. Other bits are ignored.
// There are no active lanes in in0, so the result is simply the first active
// lane from pg.
int exp00[] = {0, 0};
int exp10[] = {0, 1};
int exp20[] = {0, 1};
// | The last active lane in in1 is here.
// v
int exp01[] = {0, 0};
int exp11[] = {0, 0};
int exp21[] = {1, 0};
// | The last active lane in in2 is here.
// v
int exp02[] = {0, 0};
int exp12[] = {0, 0};
int exp22[] = {0, 0};
PnextHelper(config, kDRegSize, in0, in0, exp00);
PnextHelper(config, kDRegSize, in1, in0, exp10);
PnextHelper(config, kDRegSize, in2, in0, exp20);
PnextHelper(config, kDRegSize, in0, in1, exp01);
PnextHelper(config, kDRegSize, in1, in1, exp11);
PnextHelper(config, kDRegSize, in2, in1, exp21);
PnextHelper(config, kDRegSize, in0, in2, exp02);
PnextHelper(config, kDRegSize, in1, in2, exp12);
PnextHelper(config, kDRegSize, in2, in2, exp22);
}
TEST_SVE(sve_pnext_alias) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// Check that the Simulator behaves correctly when all arguments are aliased.
int in_b[] = {0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0};
int in_h[] = {0, 0, 0, 0, 1, 1, 0, 0};
int in_s[] = {0, 1, 1, 0};
int in_d[] = {1, 1};
Initialise(&masm, p0.VnB(), in_b);
Initialise(&masm, p1.VnH(), in_h);
Initialise(&masm, p2.VnS(), in_s);
Initialise(&masm, p3.VnD(), in_d);
// Initialise NZCV to an impossible value, to check that we actually write it.
__ Mov(x10, NZCVFlag);
__ Msr(NZCV, x10);
__ Pnext(p0.VnB(), p0, p0.VnB());
__ Mrs(x0, NZCV);
__ Msr(NZCV, x10);
__ Pnext(p1.VnB(), p1, p1.VnB());
__ Mrs(x1, NZCV);
__ Msr(NZCV, x10);
__ Pnext(p2.VnB(), p2, p2.VnB());
__ Mrs(x2, NZCV);
__ Msr(NZCV, x10);
__ Pnext(p3.VnB(), p3, p3.VnB());
__ Mrs(x3, NZCV);
END();
if (CAN_RUN()) {
RUN();
// Since pg.Is(pdn), there can be no active lanes in pg above the last
// active lane in pdn, so the result should always be zero.
ASSERT_EQUAL_SVE(0, p0.VnB());
ASSERT_EQUAL_SVE(0, p1.VnH());
ASSERT_EQUAL_SVE(0, p2.VnS());
ASSERT_EQUAL_SVE(0, p3.VnD());
ASSERT_EQUAL_64(SVENoneFlag | SVENotLastFlag, x0);
ASSERT_EQUAL_64(SVENoneFlag | SVENotLastFlag, x1);
ASSERT_EQUAL_64(SVENoneFlag | SVENotLastFlag, x2);
ASSERT_EQUAL_64(SVENoneFlag | SVENotLastFlag, x3);
}
}
static void PtrueHelper(Test* config,
unsigned lane_size_in_bits,
FlagsUpdate s = LeaveFlags) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
PRegisterWithLaneSize p[kNumberOfPRegisters];
for (unsigned i = 0; i < kNumberOfPRegisters; i++) {
p[i] = PRegister(i).WithLaneSize(lane_size_in_bits);
}
// Initialise NZCV to an impossible value, to check that we actually write it.
StatusFlags nzcv_unmodified = NZCVFlag;
__ Mov(x20, nzcv_unmodified);
// We don't have enough registers to conveniently test every pattern, so take
// samples from each group.
__ Msr(NZCV, x20);
__ Ptrue(p[0], SVE_POW2, s);
__ Mrs(x0, NZCV);
__ Msr(NZCV, x20);
__ Ptrue(p[1], SVE_VL1, s);
__ Mrs(x1, NZCV);
__ Msr(NZCV, x20);
__ Ptrue(p[2], SVE_VL2, s);
__ Mrs(x2, NZCV);
__ Msr(NZCV, x20);
__ Ptrue(p[3], SVE_VL5, s);
__ Mrs(x3, NZCV);
__ Msr(NZCV, x20);
__ Ptrue(p[4], SVE_VL6, s);
__ Mrs(x4, NZCV);
__ Msr(NZCV, x20);
__ Ptrue(p[5], SVE_VL8, s);
__ Mrs(x5, NZCV);
__ Msr(NZCV, x20);
__ Ptrue(p[6], SVE_VL16, s);
__ Mrs(x6, NZCV);
__ Msr(NZCV, x20);
__ Ptrue(p[7], SVE_VL64, s);
__ Mrs(x7, NZCV);
__ Msr(NZCV, x20);
__ Ptrue(p[8], SVE_VL256, s);
__ Mrs(x8, NZCV);
{
// We have to use the Assembler to use values not defined by
// SVEPredicateConstraint, so call `ptrue` (or `ptrues`, when setting flags)
// directly.
typedef void (
MacroAssembler::*AssemblePtrueFn)(const PRegisterWithLaneSize& pd,
int pattern);
AssemblePtrueFn assemble = &MacroAssembler::ptrue;
if (s == SetFlags) {
assemble = &MacroAssembler::ptrues;
}
ExactAssemblyScope guard(&masm, 12 * kInstructionSize);
__ msr(NZCV, x20);
(masm.*assemble)(p[9], 0xe);
__ mrs(x9, NZCV);
__ msr(NZCV, x20);
(masm.*assemble)(p[10], 0x16);
__ mrs(x10, NZCV);
__ msr(NZCV, x20);
(masm.*assemble)(p[11], 0x1a);
__ mrs(x11, NZCV);
__ msr(NZCV, x20);
(masm.*assemble)(p[12], 0x1c);
__ mrs(x12, NZCV);
}
__ Msr(NZCV, x20);
__ Ptrue(p[13], SVE_MUL4, s);
__ Mrs(x13, NZCV);
__ Msr(NZCV, x20);
__ Ptrue(p[14], SVE_MUL3, s);
__ Mrs(x14, NZCV);
__ Msr(NZCV, x20);
__ Ptrue(p[15], SVE_ALL, s);
__ Mrs(x15, NZCV);
END();
if (CAN_RUN()) {
RUN();
int all = core.GetSVELaneCount(lane_size_in_bits);
int pow2 = 1 << HighestSetBitPosition(all);
int mul4 = all - (all % 4);
int mul3 = all - (all % 3);
// Check P register results.
for (int i = 0; i < all; i++) {
ASSERT_EQUAL_SVE_LANE(i < pow2, p[0], i);
ASSERT_EQUAL_SVE_LANE((all >= 1) && (i < 1), p[1], i);
ASSERT_EQUAL_SVE_LANE((all >= 2) && (i < 2), p[2], i);
ASSERT_EQUAL_SVE_LANE((all >= 5) && (i < 5), p[3], i);
ASSERT_EQUAL_SVE_LANE((all >= 6) && (i < 6), p[4], i);
ASSERT_EQUAL_SVE_LANE((all >= 8) && (i < 8), p[5], i);
ASSERT_EQUAL_SVE_LANE((all >= 16) && (i < 16), p[6], i);
ASSERT_EQUAL_SVE_LANE((all >= 64) && (i < 64), p[7], i);
ASSERT_EQUAL_SVE_LANE((all >= 256) && (i < 256), p[8], i);
ASSERT_EQUAL_SVE_LANE(false, p[9], i);
ASSERT_EQUAL_SVE_LANE(false, p[10], i);
ASSERT_EQUAL_SVE_LANE(false, p[11], i);
ASSERT_EQUAL_SVE_LANE(false, p[12], i);
ASSERT_EQUAL_SVE_LANE(i < mul4, p[13], i);
ASSERT_EQUAL_SVE_LANE(i < mul3, p[14], i);
ASSERT_EQUAL_SVE_LANE(true, p[15], i);
}
// Check NZCV results.
if (s == LeaveFlags) {
// No flags should have been updated.
for (int i = 0; i <= 15; i++) {
ASSERT_EQUAL_64(nzcv_unmodified, XRegister(i));
}
} else {
StatusFlags zero = static_cast<StatusFlags>(SVENoneFlag | SVENotLastFlag);
StatusFlags nonzero = SVEFirstFlag;
// POW2
ASSERT_EQUAL_64(nonzero, x0);
// VL*
ASSERT_EQUAL_64((all >= 1) ? nonzero : zero, x1);
ASSERT_EQUAL_64((all >= 2) ? nonzero : zero, x2);
ASSERT_EQUAL_64((all >= 5) ? nonzero : zero, x3);
ASSERT_EQUAL_64((all >= 6) ? nonzero : zero, x4);
ASSERT_EQUAL_64((all >= 8) ? nonzero : zero, x5);
ASSERT_EQUAL_64((all >= 16) ? nonzero : zero, x6);
ASSERT_EQUAL_64((all >= 64) ? nonzero : zero, x7);
ASSERT_EQUAL_64((all >= 256) ? nonzero : zero, x8);
// #uimm5
ASSERT_EQUAL_64(zero, x9);
ASSERT_EQUAL_64(zero, x10);
ASSERT_EQUAL_64(zero, x11);
ASSERT_EQUAL_64(zero, x12);
// MUL*
ASSERT_EQUAL_64((all >= 4) ? nonzero : zero, x13);
ASSERT_EQUAL_64((all >= 3) ? nonzero : zero, x14);
// ALL
ASSERT_EQUAL_64(nonzero, x15);
}
}
}
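// For example, with 128-bit vectors and H-sized lanes, `all` is 8: SVE_POW2
// gives 8 active lanes, SVE_VL5 gives 5, SVE_VL16 gives 0 (the vector is too
// short), SVE_MUL3 gives 6 and SVE_MUL4 gives 8.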
TEST_SVE(sve_ptrue_b) { PtrueHelper(config, kBRegSize, LeaveFlags); }
TEST_SVE(sve_ptrue_h) { PtrueHelper(config, kHRegSize, LeaveFlags); }
TEST_SVE(sve_ptrue_s) { PtrueHelper(config, kSRegSize, LeaveFlags); }
TEST_SVE(sve_ptrue_d) { PtrueHelper(config, kDRegSize, LeaveFlags); }
TEST_SVE(sve_ptrues_b) { PtrueHelper(config, kBRegSize, SetFlags); }
TEST_SVE(sve_ptrues_h) { PtrueHelper(config, kHRegSize, SetFlags); }
TEST_SVE(sve_ptrues_s) { PtrueHelper(config, kSRegSize, SetFlags); }
TEST_SVE(sve_ptrues_d) { PtrueHelper(config, kDRegSize, SetFlags); }
TEST_SVE(sve_pfalse) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// Initialise non-zero inputs.
__ Ptrue(p0.VnB());
__ Ptrue(p1.VnH());
__ Ptrue(p2.VnS());
__ Ptrue(p3.VnD());
// The instruction only supports B-sized lanes, but the lane size has no
// logical effect, so the MacroAssembler accepts anything.
__ Pfalse(p0.VnB());
__ Pfalse(p1.VnH());
__ Pfalse(p2.VnS());
__ Pfalse(p3.VnD());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(0, p0.VnB());
ASSERT_EQUAL_SVE(0, p1.VnB());
ASSERT_EQUAL_SVE(0, p2.VnB());
ASSERT_EQUAL_SVE(0, p3.VnB());
}
}
TEST_SVE(sve_ptest) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// Initialise NZCV to a known (impossible) value.
StatusFlags nzcv_unmodified = NZCVFlag;
__ Mov(x0, nzcv_unmodified);
__ Msr(NZCV, x0);
// Construct some test inputs.
int in2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0};
int in3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0};
int in4[] = {0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0};
__ Pfalse(p0.VnB());
__ Ptrue(p1.VnB());
Initialise(&masm, p2.VnB(), in2);
Initialise(&masm, p3.VnB(), in3);
Initialise(&masm, p4.VnB(), in4);
// All-inactive pg.
__ Ptest(p0, p0.VnB());
__ Mrs(x0, NZCV);
__ Ptest(p0, p1.VnB());
__ Mrs(x1, NZCV);
__ Ptest(p0, p2.VnB());
__ Mrs(x2, NZCV);
__ Ptest(p0, p3.VnB());
__ Mrs(x3, NZCV);
__ Ptest(p0, p4.VnB());
__ Mrs(x4, NZCV);
// All-active pg.
__ Ptest(p1, p0.VnB());
__ Mrs(x5, NZCV);
__ Ptest(p1, p1.VnB());
__ Mrs(x6, NZCV);
__ Ptest(p1, p2.VnB());
__ Mrs(x7, NZCV);
__ Ptest(p1, p3.VnB());
__ Mrs(x8, NZCV);
__ Ptest(p1, p4.VnB());
__ Mrs(x9, NZCV);
// Combinations of other inputs.
__ Ptest(p2, p2.VnB());
__ Mrs(x20, NZCV);
__ Ptest(p2, p3.VnB());
__ Mrs(x21, NZCV);
__ Ptest(p2, p4.VnB());
__ Mrs(x22, NZCV);
__ Ptest(p3, p2.VnB());
__ Mrs(x23, NZCV);
__ Ptest(p3, p3.VnB());
__ Mrs(x24, NZCV);
__ Ptest(p3, p4.VnB());
__ Mrs(x25, NZCV);
__ Ptest(p4, p2.VnB());
__ Mrs(x26, NZCV);
__ Ptest(p4, p3.VnB());
__ Mrs(x27, NZCV);
__ Ptest(p4, p4.VnB());
__ Mrs(x28, NZCV);
END();
if (CAN_RUN()) {
RUN();
StatusFlags zero = static_cast<StatusFlags>(SVENoneFlag | SVENotLastFlag);
// If pg is all inactive, the value of pn is irrelevant.
ASSERT_EQUAL_64(zero, x0);
ASSERT_EQUAL_64(zero, x1);
ASSERT_EQUAL_64(zero, x2);
ASSERT_EQUAL_64(zero, x3);
ASSERT_EQUAL_64(zero, x4);
// All-active pg.
ASSERT_EQUAL_64(zero, x5); // All-inactive pn.
ASSERT_EQUAL_64(SVEFirstFlag, x6); // All-active pn.
// Other pn inputs are non-zero, but the first and last lanes are inactive.
ASSERT_EQUAL_64(SVENotLastFlag, x7);
ASSERT_EQUAL_64(SVENotLastFlag, x8);
ASSERT_EQUAL_64(SVENotLastFlag, x9);
// Other inputs.
ASSERT_EQUAL_64(SVEFirstFlag, x20); // pg: in2, pn: in2
ASSERT_EQUAL_64(NoFlag, x21); // pg: in2, pn: in3
ASSERT_EQUAL_64(zero, x22); // pg: in2, pn: in4
ASSERT_EQUAL_64(static_cast<StatusFlags>(SVEFirstFlag | SVENotLastFlag),
x23); // pg: in3, pn: in2
ASSERT_EQUAL_64(SVEFirstFlag, x24); // pg: in3, pn: in3
ASSERT_EQUAL_64(zero, x25); // pg: in3, pn: in4
ASSERT_EQUAL_64(zero, x26); // pg: in4, pn: in2
ASSERT_EQUAL_64(zero, x27); // pg: in4, pn: in3
ASSERT_EQUAL_64(SVEFirstFlag, x28); // pg: in4, pn: in4
}
}
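// For reference, the SVE flag names used in these tests alias the regular
// NZCV bits (SVEFirstFlag is N, SVENoneFlag is Z and SVENotLastFlag is C),
// which is why values read from NZCV with `Mrs` can be compared against them
// directly.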
TEST_SVE(sve_cntp) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// There are {7, 5, 2, 1} active {B, H, S, D} lanes.
int p0_inputs[] = {0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0};
Initialise(&masm, p0.VnB(), p0_inputs);
// With an all-true predicate, these instructions measure the vector length.
__ Ptrue(p10.VnB());
__ Ptrue(p11.VnH());
__ Ptrue(p12.VnS());
__ Ptrue(p13.VnD());
// `ptrue p10.b` provides an all-active pg.
__ Cntp(x10, p10, p10.VnB());
__ Cntp(x11, p10, p11.VnH());
__ Cntp(x12, p10, p12.VnS());
__ Cntp(x13, p10, p13.VnD());
// Check that the predicate mask is applied properly.
__ Cntp(x14, p10, p10.VnB());
__ Cntp(x15, p11, p10.VnB());
__ Cntp(x16, p12, p10.VnB());
__ Cntp(x17, p13, p10.VnB());
// Check other patterns (including some ignored bits).
__ Cntp(x0, p10, p0.VnB());
__ Cntp(x1, p10, p0.VnH());
__ Cntp(x2, p10, p0.VnS());
__ Cntp(x3, p10, p0.VnD());
__ Cntp(x4, p0, p10.VnB());
__ Cntp(x5, p0, p10.VnH());
__ Cntp(x6, p0, p10.VnS());
__ Cntp(x7, p0, p10.VnD());
END();
if (CAN_RUN()) {
RUN();
int vl_b = core.GetSVELaneCount(kBRegSize);
int vl_h = core.GetSVELaneCount(kHRegSize);
int vl_s = core.GetSVELaneCount(kSRegSize);
int vl_d = core.GetSVELaneCount(kDRegSize);
// Check all-active predicates in various combinations.
ASSERT_EQUAL_64(vl_b, x10);
ASSERT_EQUAL_64(vl_h, x11);
ASSERT_EQUAL_64(vl_s, x12);
ASSERT_EQUAL_64(vl_d, x13);
ASSERT_EQUAL_64(vl_b, x14);
ASSERT_EQUAL_64(vl_h, x15);
ASSERT_EQUAL_64(vl_s, x16);
ASSERT_EQUAL_64(vl_d, x17);
// Check that irrelevant bits are properly ignored.
ASSERT_EQUAL_64(7, x0);
ASSERT_EQUAL_64(5, x1);
ASSERT_EQUAL_64(2, x2);
ASSERT_EQUAL_64(1, x3);
ASSERT_EQUAL_64(7, x4);
ASSERT_EQUAL_64(5, x5);
ASSERT_EQUAL_64(2, x6);
ASSERT_EQUAL_64(1, x7);
}
}
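// A sketch of the `cntp` computation verified above (illustrative only):
// count the lanes that are active (first bit set) in both the governing
// predicate and the operand, at the operand's lane granularity.
static int CntpModel(const int* pg, const int* pn, int lanes) {
  int count = 0;
  for (int i = 0; i < lanes; i++) {
    if (((pg[i] & 1) == 1) && ((pn[i] & 1) == 1)) count++;
  }
  return count;
}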
typedef void (MacroAssembler::*CntFn)(const Register& dst,
int pattern,
int multiplier);
template <typename T>
void GenerateCntSequence(MacroAssembler* masm,
CntFn cnt,
T acc_value,
int multiplier) {
// Initialise accumulators.
masm->Mov(x0, acc_value);
masm->Mov(x1, acc_value);
masm->Mov(x2, acc_value);
masm->Mov(x3, acc_value);
masm->Mov(x4, acc_value);
masm->Mov(x5, acc_value);
masm->Mov(x6, acc_value);
masm->Mov(x7, acc_value);
masm->Mov(x8, acc_value);
masm->Mov(x9, acc_value);
masm->Mov(x10, acc_value);
masm->Mov(x11, acc_value);
masm->Mov(x12, acc_value);
masm->Mov(x13, acc_value);
masm->Mov(x14, acc_value);
masm->Mov(x15, acc_value);
masm->Mov(x18, acc_value);
masm->Mov(x19, acc_value);
masm->Mov(x20, acc_value);
masm->Mov(x21, acc_value);
(masm->*cnt)(Register(0, sizeof(T) * kBitsPerByte), SVE_POW2, multiplier);
(masm->*cnt)(Register(1, sizeof(T) * kBitsPerByte), SVE_VL1, multiplier);
(masm->*cnt)(Register(2, sizeof(T) * kBitsPerByte), SVE_VL2, multiplier);
(masm->*cnt)(Register(3, sizeof(T) * kBitsPerByte), SVE_VL3, multiplier);
(masm->*cnt)(Register(4, sizeof(T) * kBitsPerByte), SVE_VL4, multiplier);
(masm->*cnt)(Register(5, sizeof(T) * kBitsPerByte), SVE_VL5, multiplier);
(masm->*cnt)(Register(6, sizeof(T) * kBitsPerByte), SVE_VL6, multiplier);
(masm->*cnt)(Register(7, sizeof(T) * kBitsPerByte), SVE_VL7, multiplier);
(masm->*cnt)(Register(8, sizeof(T) * kBitsPerByte), SVE_VL8, multiplier);
(masm->*cnt)(Register(9, sizeof(T) * kBitsPerByte), SVE_VL16, multiplier);
(masm->*cnt)(Register(10, sizeof(T) * kBitsPerByte), SVE_VL32, multiplier);
(masm->*cnt)(Register(11, sizeof(T) * kBitsPerByte), SVE_VL64, multiplier);
(masm->*cnt)(Register(12, sizeof(T) * kBitsPerByte), SVE_VL128, multiplier);
(masm->*cnt)(Register(13, sizeof(T) * kBitsPerByte), SVE_VL256, multiplier);
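// Patterns 16, 23 and 28 are unallocated encodings, which select zero
// elements, so the corresponding results should match the initial
// accumulator values.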
(masm->*cnt)(Register(14, sizeof(T) * kBitsPerByte), 16, multiplier);
(masm->*cnt)(Register(15, sizeof(T) * kBitsPerByte), 23, multiplier);
(masm->*cnt)(Register(18, sizeof(T) * kBitsPerByte), 28, multiplier);
(masm->*cnt)(Register(19, sizeof(T) * kBitsPerByte), SVE_MUL4, multiplier);
(masm->*cnt)(Register(20, sizeof(T) * kBitsPerByte), SVE_MUL3, multiplier);
(masm->*cnt)(Register(21, sizeof(T) * kBitsPerByte), SVE_ALL, multiplier);
}
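// Return `fixed` if the vector holds at least `fixed` lanes, and zero
// otherwise: a fixed-size pattern (VL1-VL256) counts no lanes when the
// current vector length is too short for it.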
static int FixedVL(int fixed, int length) {
VIXL_ASSERT(((fixed >= 1) && (fixed <= 8)) || (fixed == 16) ||
(fixed == 32) || (fixed == 64) || (fixed == 128) ||
(fixed == 256));
return (length >= fixed) ? fixed : 0;
}
static void CntHelper(Test* config,
CntFn cnt,
int multiplier,
int lane_size_in_bits,
int64_t acc_value = 0,
bool is_increment = true) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
GenerateCntSequence(&masm, cnt, acc_value, multiplier);
END();
if (CAN_RUN()) {
RUN();
int all = core.GetSVELaneCount(lane_size_in_bits);
int pow2 = 1 << HighestSetBitPosition(all);
int mul4 = all - (all % 4);
int mul3 = all - (all % 3);
multiplier = is_increment ? multiplier : -multiplier;
ASSERT_EQUAL_64(acc_value + (multiplier * pow2), x0);
ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(1, all)), x1);
ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(2, all)), x2);
ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(3, all)), x3);
ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(4, all)), x4);
ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(5, all)), x5);
ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(6, all)), x6);
ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(7, all)), x7);
ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(8, all)), x8);
ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(16, all)), x9);
ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(32, all)), x10);
ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(64, all)), x11);
ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(128, all)), x12);
ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(256, all)), x13);
ASSERT_EQUAL_64(acc_value, x14);
ASSERT_EQUAL_64(acc_value, x15);
ASSERT_EQUAL_64(acc_value, x18);
ASSERT_EQUAL_64(acc_value + (multiplier * mul4), x19);
ASSERT_EQUAL_64(acc_value + (multiplier * mul3), x20);
ASSERT_EQUAL_64(acc_value + (multiplier * all), x21);
}
}
static void IncHelper(Test* config,
CntFn cnt,
int multiplier,
int lane_size_in_bits,
int64_t acc_value) {
CntHelper(config, cnt, multiplier, lane_size_in_bits, acc_value, true);
}
static void DecHelper(Test* config,
CntFn cnt,
int multiplier,
int lane_size_in_bits,
int64_t acc_value) {
CntHelper(config, cnt, multiplier, lane_size_in_bits, acc_value, false);
}
TEST_SVE(sve_cntb) {
CntHelper(config, &MacroAssembler::Cntb, 1, kBRegSize);
CntHelper(config, &MacroAssembler::Cntb, 2, kBRegSize);
CntHelper(config, &MacroAssembler::Cntb, 15, kBRegSize);
CntHelper(config, &MacroAssembler::Cntb, 16, kBRegSize);
}
TEST_SVE(sve_cnth) {
CntHelper(config, &MacroAssembler::Cnth, 1, kHRegSize);
CntHelper(config, &MacroAssembler::Cnth, 2, kHRegSize);
CntHelper(config, &MacroAssembler::Cnth, 15, kHRegSize);
CntHelper(config, &MacroAssembler::Cnth, 16, kHRegSize);
}
TEST_SVE(sve_cntw) {
CntHelper(config, &MacroAssembler::Cntw, 1, kWRegSize);
CntHelper(config, &MacroAssembler::Cntw, 2, kWRegSize);
CntHelper(config, &MacroAssembler::Cntw, 15, kWRegSize);
CntHelper(config, &MacroAssembler::Cntw, 16, kWRegSize);
}
TEST_SVE(sve_cntd) {
CntHelper(config, &MacroAssembler::Cntd, 1, kDRegSize);
CntHelper(config, &MacroAssembler::Cntd, 2, kDRegSize);
CntHelper(config, &MacroAssembler::Cntd, 15, kDRegSize);
CntHelper(config, &MacroAssembler::Cntd, 16, kDRegSize);
}
TEST_SVE(sve_decb) {
DecHelper(config, &MacroAssembler::Decb, 1, kBRegSize, 42);
DecHelper(config, &MacroAssembler::Decb, 2, kBRegSize, -1);
DecHelper(config, &MacroAssembler::Decb, 15, kBRegSize, INT64_MIN);
DecHelper(config, &MacroAssembler::Decb, 16, kBRegSize, -42);
}
TEST_SVE(sve_dech) {
DecHelper(config, &MacroAssembler::Dech, 1, kHRegSize, 42);
DecHelper(config, &MacroAssembler::Dech, 2, kHRegSize, -1);
DecHelper(config, &MacroAssembler::Dech, 15, kHRegSize, INT64_MIN);
DecHelper(config, &MacroAssembler::Dech, 16, kHRegSize, -42);
}
TEST_SVE(sve_decw) {
DecHelper(config, &MacroAssembler::Decw, 1, kWRegSize, 42);
DecHelper(config, &MacroAssembler::Decw, 2, kWRegSize, -1);
DecHelper(config, &MacroAssembler::Decw, 15, kWRegSize, INT64_MIN);
DecHelper(config, &MacroAssembler::Decw, 16, kWRegSize, -42);
}
TEST_SVE(sve_decd) {
DecHelper(config, &MacroAssembler::Decd, 1, kDRegSize, 42);
DecHelper(config, &MacroAssembler::Decd, 2, kDRegSize, -1);
DecHelper(config, &MacroAssembler::Decd, 15, kDRegSize, INT64_MIN);
DecHelper(config, &MacroAssembler::Decd, 16, kDRegSize, -42);
}
TEST_SVE(sve_incb) {
IncHelper(config, &MacroAssembler::Incb, 1, kBRegSize, 42);
IncHelper(config, &MacroAssembler::Incb, 2, kBRegSize, -1);
IncHelper(config, &MacroAssembler::Incb, 15, kBRegSize, INT64_MAX);
IncHelper(config, &MacroAssembler::Incb, 16, kBRegSize, -42);
}
TEST_SVE(sve_inch) {
IncHelper(config, &MacroAssembler::Inch, 1, kHRegSize, 42);
IncHelper(config, &MacroAssembler::Inch, 2, kHRegSize, -1);
IncHelper(config, &MacroAssembler::Inch, 15, kHRegSize, INT64_MAX);
IncHelper(config, &MacroAssembler::Inch, 16, kHRegSize, -42);
}
TEST_SVE(sve_incw) {
IncHelper(config, &MacroAssembler::Incw, 1, kWRegSize, 42);
IncHelper(config, &MacroAssembler::Incw, 2, kWRegSize, -1);
IncHelper(config, &MacroAssembler::Incw, 15, kWRegSize, INT64_MAX);
IncHelper(config, &MacroAssembler::Incw, 16, kWRegSize, -42);
}
TEST_SVE(sve_incd) {
IncHelper(config, &MacroAssembler::Incd, 1, kDRegSize, 42);
IncHelper(config, &MacroAssembler::Incd, 2, kDRegSize, -1);
IncHelper(config, &MacroAssembler::Incd, 15, kDRegSize, INT64_MAX);
IncHelper(config, &MacroAssembler::Incd, 16, kDRegSize, -42);
}
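// Reference saturating add for the sq/uq inc/dec tests: clamp x + y to the
// representable range of T without relying on overflow behaviour. For
// example, QAdd<uint8_t>(250, 10) returns 255, and QAdd<int8_t>(-120, -10)
// returns -128.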
template <typename T>
static T QAdd(T x, int y) {
VIXL_ASSERT(y > INT_MIN);
T result;
T min = std::numeric_limits<T>::min();
T max = std::numeric_limits<T>::max();
if ((x >= 0) && (y >= 0)) {
// For positive x and y, saturate at max.
result = (max - x) < static_cast<T>(y) ? max : x + y;
} else if ((y < 0) && ((x < 0) || (min == 0))) {
// For negative y, saturate at min when either x is negative or T is
// unsigned.
result = (x - min) < static_cast<T>(-y) ? min : x + y;
} else {
result = x + y;
}
return result;
}
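// Run GenerateCntSequence for a saturating inc/dec instruction and check
// every result against the QAdd reference above.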
template <typename T>
static void QIncDecHelper(Test* config,
CntFn cnt,
int multiplier,
int lane_size_in_bits,
T acc_value,
bool is_increment) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
GenerateCntSequence(&masm, cnt, acc_value, multiplier);
END();
if (CAN_RUN()) {
RUN();
int all = core.GetSVELaneCount(lane_size_in_bits);
int pow2 = 1 << HighestSetBitPosition(all);
int mul4 = all - (all % 4);
int mul3 = all - (all % 3);
multiplier = is_increment ? multiplier : -multiplier;
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * pow2), x0);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(1, all)), x1);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(2, all)), x2);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(3, all)), x3);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(4, all)), x4);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(5, all)), x5);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(6, all)), x6);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(7, all)), x7);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(8, all)), x8);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(16, all)), x9);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(32, all)), x10);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(64, all)), x11);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(128, all)), x12);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(256, all)), x13);
ASSERT_EQUAL_64(acc_value, x14);
ASSERT_EQUAL_64(acc_value, x15);
ASSERT_EQUAL_64(acc_value, x18);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * mul4), x19);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * mul3), x20);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * all), x21);
}
}
template <typename T>
static void QIncHelper(Test* config,
CntFn cnt,
int multiplier,
int lane_size_in_bits,
T acc_value) {
QIncDecHelper<T>(config, cnt, multiplier, lane_size_in_bits, acc_value, true);
}
template <typename T>
static void QDecHelper(Test* config,
CntFn cnt,
int multiplier,
int lane_size_in_bits,
T acc_value) {
QIncDecHelper<T>(config,
cnt,
multiplier,
lane_size_in_bits,
acc_value,
false);
}
TEST_SVE(sve_sqdecb) {
int64_t bigneg = INT64_MIN + 42;
int64_t bigpos = INT64_MAX - 42;
QDecHelper<int64_t>(config, &MacroAssembler::Sqdecb, 1, kBRegSize, 1);
QDecHelper<int64_t>(config, &MacroAssembler::Sqdecb, 2, kBRegSize, bigneg);
QDecHelper<int64_t>(config, &MacroAssembler::Sqdecb, 15, kBRegSize, 999);
QDecHelper<int64_t>(config, &MacroAssembler::Sqdecb, 16, kBRegSize, bigpos);
}
TEST_SVE(sve_sqdech) {
int64_t bigneg = INT64_MIN + 42;
int64_t bigpos = INT64_MAX - 42;
QDecHelper<int64_t>(config, &MacroAssembler::Sqdech, 1, kHRegSize, 1);
QDecHelper<int64_t>(config, &MacroAssembler::Sqdech, 2, kHRegSize, bigneg);
QDecHelper<int64_t>(config, &MacroAssembler::Sqdech, 15, kHRegSize, 999);
QDecHelper<int64_t>(config, &MacroAssembler::Sqdech, 16, kHRegSize, bigpos);
}
TEST_SVE(sve_sqdecw) {
int64_t bigneg = INT64_MIN + 42;
int64_t bigpos = INT64_MAX - 42;
QDecHelper<int64_t>(config, &MacroAssembler::Sqdecw, 1, kWRegSize, 1);
QDecHelper<int64_t>(config, &MacroAssembler::Sqdecw, 2, kWRegSize, bigneg);
QDecHelper<int64_t>(config, &MacroAssembler::Sqdecw, 15, kWRegSize, 999);
QDecHelper<int64_t>(config, &MacroAssembler::Sqdecw, 16, kWRegSize, bigpos);
}
TEST_SVE(sve_sqdecd) {
int64_t bigneg = INT64_MIN + 42;
int64_t bigpos = INT64_MAX - 42;
QDecHelper<int64_t>(config, &MacroAssembler::Sqdecd, 1, kDRegSize, 1);
QDecHelper<int64_t>(config, &MacroAssembler::Sqdecd, 2, kDRegSize, bigneg);
QDecHelper<int64_t>(config, &MacroAssembler::Sqdecd, 15, kDRegSize, 999);
QDecHelper<int64_t>(config, &MacroAssembler::Sqdecd, 16, kDRegSize, bigpos);
}
TEST_SVE(sve_sqincb) {
int64_t bigneg = INT64_MIN + 42;
int64_t bigpos = INT64_MAX - 42;
QIncHelper<int64_t>(config, &MacroAssembler::Sqincb, 1, kBRegSize, 1);
QIncHelper<int64_t>(config, &MacroAssembler::Sqincb, 2, kBRegSize, bigneg);
QIncHelper<int64_t>(config, &MacroAssembler::Sqincb, 15, kBRegSize, 999);
QIncHelper<int64_t>(config, &MacroAssembler::Sqincb, 16, kBRegSize, bigpos);
}
TEST_SVE(sve_sqinch) {
int64_t bigneg = INT64_MIN + 42;
int64_t bigpos = INT64_MAX - 42;
QIncHelper<int64_t>(config, &MacroAssembler::Sqinch, 1, kHRegSize, 1);
QIncHelper<int64_t>(config, &MacroAssembler::Sqinch, 2, kHRegSize, bigneg);
QIncHelper<int64_t>(config, &MacroAssembler::Sqinch, 15, kHRegSize, 999);
QIncHelper<int64_t>(config, &MacroAssembler::Sqinch, 16, kHRegSize, bigpos);
}
TEST_SVE(sve_sqincw) {
int64_t bigneg = INT64_MIN + 42;
int64_t bigpos = INT64_MAX - 42;
QIncHelper<int64_t>(config, &MacroAssembler::Sqincw, 1, kWRegSize, 1);
QIncHelper<int64_t>(config, &MacroAssembler::Sqincw, 2, kWRegSize, bigneg);
QIncHelper<int64_t>(config, &MacroAssembler::Sqincw, 15, kWRegSize, 999);
QIncHelper<int64_t>(config, &MacroAssembler::Sqincw, 16, kWRegSize, bigpos);
}
TEST_SVE(sve_sqincd) {
int64_t bigneg = INT64_MIN + 42;
int64_t bigpos = INT64_MAX - 42;
QIncHelper<int64_t>(config, &MacroAssembler::Sqincd, 1, kDRegSize, 1);
QIncHelper<int64_t>(config, &MacroAssembler::Sqincd, 2, kDRegSize, bigneg);
QIncHelper<int64_t>(config, &MacroAssembler::Sqincd, 15, kDRegSize, 999);
QIncHelper<int64_t>(config, &MacroAssembler::Sqincd, 16, kDRegSize, bigpos);
}
TEST_SVE(sve_uqdecb) {
uint32_t big32 = UINT32_MAX - 42;
uint64_t big64 = UINT64_MAX - 42;
QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecb, 1, kBRegSize, 1);
QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecb, 2, kBRegSize, 42);
QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecb, 15, kBRegSize, 999);
QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecb, 16, kBRegSize, big32);
QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecb, 1, kBRegSize, 1);
QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecb, 2, kBRegSize, 42);
QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecb, 15, kBRegSize, 999);
QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecb, 16, kBRegSize, big64);
}
TEST_SVE(sve_uqdech) {
uint32_t big32 = UINT32_MAX - 42;
uint64_t big64 = UINT64_MAX - 42;
QDecHelper<uint32_t>(config, &MacroAssembler::Uqdech, 1, kHRegSize, 1);
QDecHelper<uint32_t>(config, &MacroAssembler::Uqdech, 2, kHRegSize, 42);
QDecHelper<uint32_t>(config, &MacroAssembler::Uqdech, 15, kHRegSize, 999);
QDecHelper<uint32_t>(config, &MacroAssembler::Uqdech, 16, kHRegSize, big32);
QDecHelper<uint64_t>(config, &MacroAssembler::Uqdech, 1, kHRegSize, 1);
QDecHelper<uint64_t>(config, &MacroAssembler::Uqdech, 2, kHRegSize, 42);
QDecHelper<uint64_t>(config, &MacroAssembler::Uqdech, 15, kHRegSize, 999);
QDecHelper<uint64_t>(config, &MacroAssembler::Uqdech, 16, kHRegSize, big64);
}
TEST_SVE(sve_uqdecw) {
uint32_t big32 = UINT32_MAX - 42;
uint64_t big64 = UINT64_MAX - 42;
QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecw, 1, kWRegSize, 1);
QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecw, 2, kWRegSize, 42);
QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecw, 15, kWRegSize, 999);
QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecw, 16, kWRegSize, big32);
QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecw, 1, kWRegSize, 1);
QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecw, 2, kWRegSize, 42);
QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecw, 15, kWRegSize, 999);
QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecw, 16, kWRegSize, big64);
}
TEST_SVE(sve_uqdecd) {
uint32_t big32 = UINT32_MAX - 42;
uint64_t big64 = UINT64_MAX - 42;
QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecd, 1, kDRegSize, 1);
QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecd, 2, kDRegSize, 42);
QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecd, 15, kDRegSize, 999);
QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecd, 16, kDRegSize, big32);
QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecd, 1, kDRegSize, 1);
QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecd, 2, kDRegSize, 42);
QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecd, 15, kDRegSize, 999);
QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecd, 16, kDRegSize, big64);
}
TEST_SVE(sve_uqincb) {
uint32_t big32 = UINT32_MAX - 42;
uint64_t big64 = UINT64_MAX - 42;
QIncHelper<uint32_t>(config, &MacroAssembler::Uqincb, 1, kBRegSize, 1);
QIncHelper<uint32_t>(config, &MacroAssembler::Uqincb, 2, kBRegSize, 42);
QIncHelper<uint32_t>(config, &MacroAssembler::Uqincb, 15, kBRegSize, 999);
QIncHelper<uint32_t>(config, &MacroAssembler::Uqincb, 16, kBRegSize, big32);
QIncHelper<uint64_t>(config, &MacroAssembler::Uqincb, 1, kBRegSize, 1);
QIncHelper<uint64_t>(config, &MacroAssembler::Uqincb, 2, kBRegSize, 42);
QIncHelper<uint64_t>(config, &MacroAssembler::Uqincb, 15, kBRegSize, 999);
QIncHelper<uint64_t>(config, &MacroAssembler::Uqincb, 16, kBRegSize, big64);
}
TEST_SVE(sve_uqinch) {
uint32_t big32 = UINT32_MAX - 42;
uint64_t big64 = UINT64_MAX - 42;
QIncHelper<uint32_t>(config, &MacroAssembler::Uqinch, 1, kHRegSize, 1);
QIncHelper<uint32_t>(config, &MacroAssembler::Uqinch, 2, kHRegSize, 42);
QIncHelper<uint32_t>(config, &MacroAssembler::Uqinch, 15, kHRegSize, 999);
QIncHelper<uint32_t>(config, &MacroAssembler::Uqinch, 16, kHRegSize, big32);
QIncHelper<uint64_t>(config, &MacroAssembler::Uqinch, 1, kHRegSize, 1);
QIncHelper<uint64_t>(config, &MacroAssembler::Uqinch, 2, kHRegSize, 42);
QIncHelper<uint64_t>(config, &MacroAssembler::Uqinch, 15, kHRegSize, 999);
QIncHelper<uint64_t>(config, &MacroAssembler::Uqinch, 16, kHRegSize, big64);
}
TEST_SVE(sve_uqincw) {
uint32_t big32 = UINT32_MAX - 42;
uint64_t big64 = UINT64_MAX - 42;
QIncHelper<uint32_t>(config, &MacroAssembler::Uqincw, 1, kWRegSize, 1);
QIncHelper<uint32_t>(config, &MacroAssembler::Uqincw, 2, kWRegSize, 42);
QIncHelper<uint32_t>(config, &MacroAssembler::Uqincw, 15, kWRegSize, 999);
QIncHelper<uint32_t>(config, &MacroAssembler::Uqincw, 16, kWRegSize, big32);
QIncHelper<uint64_t>(config, &MacroAssembler::Uqincw, 1, kWRegSize, 1);
QIncHelper<uint64_t>(config, &MacroAssembler::Uqincw, 2, kWRegSize, 42);
QIncHelper<uint64_t>(config, &MacroAssembler::Uqincw, 15, kWRegSize, 999);
QIncHelper<uint64_t>(config, &MacroAssembler::Uqincw, 16, kWRegSize, big64);
}
TEST_SVE(sve_uqincd) {
uint32_t big32 = UINT32_MAX - 42;
uint64_t big64 = UINT64_MAX - 42;
QIncHelper<uint32_t>(config, &MacroAssembler::Uqincd, 1, kDRegSize, 1);
QIncHelper<uint32_t>(config, &MacroAssembler::Uqincd, 2, kDRegSize, 42);
QIncHelper<uint32_t>(config, &MacroAssembler::Uqincd, 15, kDRegSize, 999);
QIncHelper<uint32_t>(config, &MacroAssembler::Uqincd, 16, kDRegSize, big32);
QIncHelper<uint64_t>(config, &MacroAssembler::Uqincd, 1, kDRegSize, 1);
QIncHelper<uint64_t>(config, &MacroAssembler::Uqincd, 2, kDRegSize, 42);
QIncHelper<uint64_t>(config, &MacroAssembler::Uqincd, 15, kDRegSize, 999);
QIncHelper<uint64_t>(config, &MacroAssembler::Uqincd, 16, kDRegSize, big64);
}
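// The signed saturating inc/dec instructions also have an <Xd>, <Wn> form,
// which saturates the low 32 bits of the accumulator and sign-extends the
// result into the 64-bit destination.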
typedef void (MacroAssembler::*QIncDecXWFn)(const Register& dst,
const Register& src,
int pattern,
int multiplier);
static void QIncDecXWHelper(Test* config,
QIncDecXWFn cnt,
int multiplier,
int lane_size_in_bits,
int32_t acc_value,
bool is_increment) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// Initialise accumulators.
__ Mov(x0, acc_value);
__ Mov(x1, acc_value);
__ Mov(x2, acc_value);
__ Mov(x3, acc_value);
__ Mov(x4, acc_value);
__ Mov(x5, acc_value);
__ Mov(x6, acc_value);
__ Mov(x7, acc_value);
__ Mov(x8, acc_value);
__ Mov(x9, acc_value);
__ Mov(x10, acc_value);
__ Mov(x11, acc_value);
__ Mov(x12, acc_value);
__ Mov(x13, acc_value);
__ Mov(x14, acc_value);
__ Mov(x15, acc_value);
__ Mov(x18, acc_value);
__ Mov(x19, acc_value);
__ Mov(x20, acc_value);
__ Mov(x21, acc_value);
(masm.*cnt)(x0, w0, SVE_POW2, multiplier);
(masm.*cnt)(x1, w1, SVE_VL1, multiplier);
(masm.*cnt)(x2, w2, SVE_VL2, multiplier);
(masm.*cnt)(x3, w3, SVE_VL3, multiplier);
(masm.*cnt)(x4, w4, SVE_VL4, multiplier);
(masm.*cnt)(x5, w5, SVE_VL5, multiplier);
(masm.*cnt)(x6, w6, SVE_VL6, multiplier);
(masm.*cnt)(x7, w7, SVE_VL7, multiplier);
(masm.*cnt)(x8, w8, SVE_VL8, multiplier);
(masm.*cnt)(x9, w9, SVE_VL16, multiplier);
(masm.*cnt)(x10, w10, SVE_VL32, multiplier);
(masm.*cnt)(x11, w11, SVE_VL64, multiplier);
(masm.*cnt)(x12, w12, SVE_VL128, multiplier);
(masm.*cnt)(x13, w13, SVE_VL256, multiplier);
(masm.*cnt)(x14, w14, 16, multiplier);
(masm.*cnt)(x15, w15, 23, multiplier);
(masm.*cnt)(x18, w18, 28, multiplier);
(masm.*cnt)(x19, w19, SVE_MUL4, multiplier);
(masm.*cnt)(x20, w20, SVE_MUL3, multiplier);
(masm.*cnt)(x21, w21, SVE_ALL, multiplier);
END();
if (CAN_RUN()) {
RUN();
int all = core.GetSVELaneCount(lane_size_in_bits);
int pow2 = 1 << HighestSetBitPosition(all);
int mul4 = all - (all % 4);
int mul3 = all - (all % 3);
multiplier = is_increment ? multiplier : -multiplier;
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * pow2), x0);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(1, all)), x1);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(2, all)), x2);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(3, all)), x3);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(4, all)), x4);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(5, all)), x5);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(6, all)), x6);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(7, all)), x7);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(8, all)), x8);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(16, all)), x9);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(32, all)), x10);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(64, all)), x11);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(128, all)), x12);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(256, all)), x13);
ASSERT_EQUAL_64(acc_value, x14);
ASSERT_EQUAL_64(acc_value, x15);
ASSERT_EQUAL_64(acc_value, x18);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * mul4), x19);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * mul3), x20);
ASSERT_EQUAL_64(QAdd(acc_value, multiplier * all), x21);
}
}
static void QIncXWHelper(Test* config,
QIncDecXWFn cnt,
int multiplier,
int lane_size_in_bits,
int32_t acc_value) {
QIncDecXWHelper(config, cnt, multiplier, lane_size_in_bits, acc_value, true);
}
static void QDecXWHelper(Test* config,
QIncDecXWFn cnt,
int multiplier,
int lane_size_in_bits,
int32_t acc_value) {
QIncDecXWHelper(config, cnt, multiplier, lane_size_in_bits, acc_value, false);
}
TEST_SVE(sve_sqdecb_xw) {
QDecXWHelper(config, &MacroAssembler::Sqdecb, 1, kBRegSize, 1);
QDecXWHelper(config, &MacroAssembler::Sqdecb, 2, kBRegSize, INT32_MIN + 42);
QDecXWHelper(config, &MacroAssembler::Sqdecb, 15, kBRegSize, 999);
QDecXWHelper(config, &MacroAssembler::Sqdecb, 16, kBRegSize, INT32_MAX - 42);
}
TEST_SVE(sve_sqdech_xw) {
QDecXWHelper(config, &MacroAssembler::Sqdech, 1, kHRegSize, 1);
QDecXWHelper(config, &MacroAssembler::Sqdech, 2, kHRegSize, INT32_MIN + 42);
QDecXWHelper(config, &MacroAssembler::Sqdech, 15, kHRegSize, 999);
QDecXWHelper(config, &MacroAssembler::Sqdech, 16, kHRegSize, INT32_MAX - 42);
}
TEST_SVE(sve_sqdecw_xw) {
QDecXWHelper(config, &MacroAssembler::Sqdecw, 1, kWRegSize, 1);
QDecXWHelper(config, &MacroAssembler::Sqdecw, 2, kWRegSize, INT32_MIN + 42);
QDecXWHelper(config, &MacroAssembler::Sqdecw, 15, kWRegSize, 999);
QDecXWHelper(config, &MacroAssembler::Sqdecw, 16, kWRegSize, INT32_MAX - 42);
}
TEST_SVE(sve_sqdecd_xw) {
QDecXWHelper(config, &MacroAssembler::Sqdecd, 1, kDRegSize, 1);
QDecXWHelper(config, &MacroAssembler::Sqdecd, 2, kDRegSize, INT32_MIN + 42);
QDecXWHelper(config, &MacroAssembler::Sqdecd, 15, kDRegSize, 999);
QDecXWHelper(config, &MacroAssembler::Sqdecd, 16, kDRegSize, INT32_MAX - 42);
}
TEST_SVE(sve_sqincb_xw) {
QIncXWHelper(config, &MacroAssembler::Sqincb, 1, kBRegSize, 1);
QIncXWHelper(config, &MacroAssembler::Sqincb, 2, kBRegSize, INT32_MIN + 42);
QIncXWHelper(config, &MacroAssembler::Sqincb, 15, kBRegSize, 999);
QIncXWHelper(config, &MacroAssembler::Sqincb, 16, kBRegSize, INT32_MAX - 42);
}
TEST_SVE(sve_sqinch_xw) {
QIncXWHelper(config, &MacroAssembler::Sqinch, 1, kHRegSize, 1);
QIncXWHelper(config, &MacroAssembler::Sqinch, 2, kHRegSize, INT32_MIN + 42);
QIncXWHelper(config, &MacroAssembler::Sqinch, 15, kHRegSize, 999);
QIncXWHelper(config, &MacroAssembler::Sqinch, 16, kHRegSize, INT32_MAX - 42);
}
TEST_SVE(sve_sqincw_xw) {
QIncXWHelper(config, &MacroAssembler::Sqincw, 1, kWRegSize, 1);
QIncXWHelper(config, &MacroAssembler::Sqincw, 2, kWRegSize, INT32_MIN + 42);
QIncXWHelper(config, &MacroAssembler::Sqincw, 15, kWRegSize, 999);
QIncXWHelper(config, &MacroAssembler::Sqincw, 16, kWRegSize, INT32_MAX - 42);
}
TEST_SVE(sve_sqincd_xw) {
QIncXWHelper(config, &MacroAssembler::Sqincd, 1, kDRegSize, 1);
QIncXWHelper(config, &MacroAssembler::Sqincd, 2, kDRegSize, INT32_MIN + 42);
QIncXWHelper(config, &MacroAssembler::Sqincd, 15, kDRegSize, 999);
QIncXWHelper(config, &MacroAssembler::Sqincd, 16, kDRegSize, INT32_MAX - 42);
}
typedef void (MacroAssembler::*IncDecZFn)(const ZRegister& dst,
int pattern,
int multiplier);
typedef void (MacroAssembler::*AddSubFn)(const ZRegister& dst,
const ZRegister& src1,
const ZRegister& src2);
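// Cross-check the vector form of an inc/dec instruction against its scalar
// counterpart: z16-z30 are updated directly, while z0-z14 receive the same
// update via a scalar count that is broadcast and applied with a vector
// add/sub.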
static void IncDecZHelper(Test* config,
IncDecZFn fn,
CntFn cnt,
AddSubFn addsub,
int multiplier,
int lane_size_in_bits) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t acc_inputs[] = {0x7766554433221100,
0xffffffffffffffff,
0x0000000000000000,
0xffffffff0000ffff,
0x7fffffffffffffff,
0x8000000000000000,
0x7fffffff7fff7fff,
0x8000000080008000};
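// Fill every Z register with the accumulator pattern. Four passes of eight
// 64-bit values cover the 2048-bit architectural maximum vector length.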
for (unsigned i = 0; i < kNumberOfZRegisters; i++) {
for (int j = 0; j < 4; j++) {
InsrHelper(&masm, ZRegister(i, kDRegSize), acc_inputs);
}
}
for (unsigned i = 0; i < 15; i++) {
__ Mov(XRegister(i), 0);
}
(masm.*fn)(z16.WithLaneSize(lane_size_in_bits), SVE_POW2, multiplier);
(masm.*fn)(z17.WithLaneSize(lane_size_in_bits), SVE_VL1, multiplier);
(masm.*fn)(z18.WithLaneSize(lane_size_in_bits), SVE_VL2, multiplier);
(masm.*fn)(z19.WithLaneSize(lane_size_in_bits), SVE_VL3, multiplier);
(masm.*fn)(z20.WithLaneSize(lane_size_in_bits), SVE_VL4, multiplier);
(masm.*fn)(z21.WithLaneSize(lane_size_in_bits), SVE_VL7, multiplier);
(masm.*fn)(z22.WithLaneSize(lane_size_in_bits), SVE_VL8, multiplier);
(masm.*fn)(z23.WithLaneSize(lane_size_in_bits), SVE_VL16, multiplier);
(masm.*fn)(z24.WithLaneSize(lane_size_in_bits), SVE_VL64, multiplier);
(masm.*fn)(z25.WithLaneSize(lane_size_in_bits), SVE_VL256, multiplier);
(masm.*fn)(z26.WithLaneSize(lane_size_in_bits), 16, multiplier);
(masm.*fn)(z27.WithLaneSize(lane_size_in_bits), 28, multiplier);
(masm.*fn)(z28.WithLaneSize(lane_size_in_bits), SVE_MUL3, multiplier);
(masm.*fn)(z29.WithLaneSize(lane_size_in_bits), SVE_MUL4, multiplier);
(masm.*fn)(z30.WithLaneSize(lane_size_in_bits), SVE_ALL, multiplier);
// Perform computation using alternative instructions.
(masm.*cnt)(x0, SVE_POW2, multiplier);
(masm.*cnt)(x1, SVE_VL1, multiplier);
(masm.*cnt)(x2, SVE_VL2, multiplier);
(masm.*cnt)(x3, SVE_VL3, multiplier);
(masm.*cnt)(x4, SVE_VL4, multiplier);
(masm.*cnt)(x5, SVE_VL7, multiplier);
(masm.*cnt)(x6, SVE_VL8, multiplier);
(masm.*cnt)(x7, SVE_VL16, multiplier);
(masm.*cnt)(x8, SVE_VL64, multiplier);
(masm.*cnt)(x9, SVE_VL256, multiplier);
(masm.*cnt)(x10, 16, multiplier);
(masm.*cnt)(x11, 28, multiplier);
(masm.*cnt)(x12, SVE_MUL3, multiplier);
(masm.*cnt)(x13, SVE_MUL4, multiplier);
(masm.*cnt)(x14, SVE_ALL, multiplier);
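// Broadcast each scalar count and apply it to the corresponding accumulator
// using the equivalent (possibly saturating) vector add or sub.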
ZRegister zscratch = z15.WithLaneSize(lane_size_in_bits);
for (unsigned i = 0; i < 15; i++) {
ZRegister zsrcdst = ZRegister(i, lane_size_in_bits);
Register x = Register(i, kXRegSize);
__ Dup(zscratch, x);
(masm.*addsub)(zsrcdst, zsrcdst, zscratch);
}
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(z0, z16);
ASSERT_EQUAL_SVE(z1, z17);
ASSERT_EQUAL_SVE(z2, z18);
ASSERT_EQUAL_SVE(z3, z19);
ASSERT_EQUAL_SVE(z4, z20);
ASSERT_EQUAL_SVE(z5, z21);
ASSERT_EQUAL_SVE(z6, z22);
ASSERT_EQUAL_SVE(z7, z23);
ASSERT_EQUAL_SVE(z8, z24);
ASSERT_EQUAL_SVE(z9, z25);
ASSERT_EQUAL_SVE(z10, z26);
ASSERT_EQUAL_SVE(z11, z27);
ASSERT_EQUAL_SVE(z12, z28);
ASSERT_EQUAL_SVE(z13, z29);
ASSERT_EQUAL_SVE(z14, z30);
}
}
TEST_SVE(sve_inc_dec_vec) {
CntFn cnth = &MacroAssembler::Cnth;
CntFn cntw = &MacroAssembler::Cntw;
CntFn cntd = &MacroAssembler::Cntd;
AddSubFn sub = &MacroAssembler::Sub;
AddSubFn add = &MacroAssembler::Add;
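// Multipliers 1, 6, 11 and 16 sample the encodable range [1, 16].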
for (int mult = 1; mult <= 16; mult += 5) {
IncDecZHelper(config, &MacroAssembler::Dech, cnth, sub, mult, kHRegSize);
IncDecZHelper(config, &MacroAssembler::Decw, cntw, sub, mult, kSRegSize);
IncDecZHelper(config, &MacroAssembler::Decd, cntd, sub, mult, kDRegSize);
IncDecZHelper(config, &MacroAssembler::Inch, cnth, add, mult, kHRegSize);
IncDecZHelper(config, &MacroAssembler::Incw, cntw, add, mult, kSRegSize);
IncDecZHelper(config, &MacroAssembler::Incd, cntd, add, mult, kDRegSize);
}
}
TEST_SVE(sve_unsigned_sat_inc_dec_vec) {
CntFn cnth = &MacroAssembler::Cnth;
CntFn cntw = &MacroAssembler::Cntw;
CntFn cntd = &MacroAssembler::Cntd;
AddSubFn sub = &MacroAssembler::Uqsub;
AddSubFn add = &MacroAssembler::Uqadd;
for (int mult = 1; mult <= 16; mult += 5) {
IncDecZHelper(config, &MacroAssembler::Uqdech, cnth, sub, mult, kHRegSize);
IncDecZHelper(config, &MacroAssembler::Uqdecw, cntw, sub, mult, kSRegSize);
IncDecZHelper(config, &MacroAssembler::Uqdecd, cntd, sub, mult, kDRegSize);
IncDecZHelper(config, &MacroAssembler::Uqinch, cnth, add, mult, kHRegSize);
IncDecZHelper(config, &MacroAssembler::Uqincw, cntw, add, mult, kSRegSize);
IncDecZHelper(config, &MacroAssembler::Uqincd, cntd, add, mult, kDRegSize);
}
}
TEST_SVE(sve_signed_sat_inc_dec_vec) {
CntFn cnth = &MacroAssembler::Cnth;
CntFn cntw = &MacroAssembler::Cntw;
CntFn cntd = &MacroAssembler::Cntd;
AddSubFn sub = &MacroAssembler::Sqsub;
AddSubFn add = &MacroAssembler::Sqadd;
for (int mult = 1; mult <= 16; mult += 5) {
IncDecZHelper(config, &MacroAssembler::Sqdech, cnth, sub, mult, kHRegSize);
IncDecZHelper(config, &MacroAssembler::Sqdecw, cntw, sub, mult, kSRegSize);
IncDecZHelper(config, &MacroAssembler::Sqdecd, cntd, sub, mult, kDRegSize);
IncDecZHelper(config, &MacroAssembler::Sqinch, cnth, add, mult, kHRegSize);
IncDecZHelper(config, &MacroAssembler::Sqincw, cntw, add, mult, kSRegSize);
IncDecZHelper(config, &MacroAssembler::Sqincd, cntd, add, mult, kDRegSize);
}
}
typedef void (MacroAssembler::*ArithPredicatedFn)(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& zn,
const ZRegister& zm);
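// Test a predicated binary arithmetic macro in its three operand-aliasing
// forms: zd aliasing zn, zd aliasing zm, and zd distinct from both sources.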
template <typename Td, typename Tg, typename Tn>
static void IntBinArithHelper(Test* config,
ArithPredicatedFn macro,
unsigned lane_size_in_bits,
const Tg& pg_inputs,
const Tn& zn_inputs,
const Tn& zm_inputs,
const Td& zd_expected) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
ZRegister src_a = z30.WithLaneSize(lane_size_in_bits);
ZRegister src_b = z27.WithLaneSize(lane_size_in_bits);
InsrHelper(&masm, src_a, zn_inputs);
InsrHelper(&masm, src_b, zm_inputs);
Initialise(&masm, p0.WithLaneSize(lane_size_in_bits), pg_inputs);
ZRegister zd_1 = z0.WithLaneSize(lane_size_in_bits);
ZRegister zd_2 = z1.WithLaneSize(lane_size_in_bits);
ZRegister zd_3 = z2.WithLaneSize(lane_size_in_bits);
// `instr` zd(dst), zd(src_a), zm(src_b)
__ Mov(zd_1, src_a);
(masm.*macro)(zd_1, p0.Merging(), zd_1, src_b);
// `instr` zd(dst), zn(src_a), zd(src_b)
// If zd aliases zm, the instruction macro (`Instr`) swaps the operand order
// when the operation is commutative; otherwise it emits the reversed form of
// the instruction, such as subr or divr.
__ Mov(zd_2, src_b);
(masm.*macro)(zd_2, p0.Merging(), src_a, zd_2);
// `instr` zd(dst), zn(src_a), zm(src_b)
// The instruction macro (`Instr`) automatically selects between `instr` and
// movprfx + `instr`, based on whether the zd and zn registers are aliased.
// Any generated movprfx is predicated, using the same governing predicate
// register, so initialise the destination register first to keep the result
// deterministic.
__ Mov(zd_3, src_a);
(masm.*macro)(zd_3, p0.Merging(), src_a, src_b);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(zd_expected, zd_1);
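// zd_2 was initialised from src_b, so its inactive lanes should still hold
// the corresponding zm_inputs values.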
for (size_t i = 0; i < ArrayLength(zd_expected); i++) {
int lane = static_cast<int>(ArrayLength(zd_expected) - i - 1);
if (!core.HasSVELane(zd_2, lane)) break;
if ((pg_inputs[i] & 1) != 0) {
ASSERT_EQUAL_SVE_LANE(zd_expected[i], zd_2, lane);
} else {
ASSERT_EQUAL_SVE_LANE(zm_inputs[i], zd_2, lane);
}
}
ASSERT_EQUAL_SVE(zd_expected, zd_3);
}
}
TEST_SVE(sve_binary_arithmetic_predicated_add) {
// clang-format off
unsigned zn_b[] = {0x00, 0x01, 0x10, 0x81, 0xff, 0x0f, 0x01, 0x7f};
unsigned zm_b[] = {0x00, 0x01, 0x10, 0x00, 0x81, 0x80, 0xff, 0xff};
unsigned zn_h[] = {0x0000, 0x0123, 0x1010, 0x8181, 0xffff, 0x0f0f, 0x0101, 0x7f7f};
unsigned zm_h[] = {0x0000, 0x0123, 0x1010, 0x0000, 0x8181, 0x8080, 0xffff, 0xffff};
unsigned zn_s[] = {0x00000000, 0x01234567, 0x10101010, 0x81818181,
0xffffffff, 0x0f0f0f0f, 0x01010101, 0x7f7f7f7f};
unsigned zm_s[] = {0x00000000, 0x01234567, 0x10101010, 0x00000000,
0x81818181, 0x80808080, 0xffffffff, 0xffffffff};
uint64_t zn_d[] = {0x0000000000000000, 0x0123456789abcdef,
0x1010101010101010, 0x8181818181818181,
0xffffffffffffffff, 0x0f0f0f0f0f0f0f0f,
0x0101010101010101, 0x7f7f7f7fffffffff};
uint64_t zm_d[] = {0x0000000000000000, 0x0123456789abcdef,
0x1010101010101010, 0x0000000000000000,
0x8181818181818181, 0x8080808080808080,
0xffffffffffffffff, 0xffffffffffffffff};
int pg_b[] = {1, 1, 1, 0, 1, 1, 1, 0};
int pg_h[] = {1, 1, 0, 1, 1, 1, 0, 1};
int pg_s[] = {1, 0, 1, 1, 1, 0, 1, 1};
int pg_d[] = {0, 1, 1, 1, 0, 1, 1, 1};
unsigned add_exp_b[] = {0x00, 0x02, 0x20, 0x81, 0x80, 0x8f, 0x00, 0x7f};
unsigned add_exp_h[] = {0x0000, 0x0246, 0x1010, 0x8181,
0x8180, 0x8f8f, 0x0101, 0x7f7e};
unsigned add_exp_s[] = {0x00000000, 0x01234567, 0x20202020, 0x81818181,
0x81818180, 0x0f0f0f0f, 0x01010100, 0x7f7f7f7e};
uint64_t add_exp_d[] = {0x0000000000000000, 0x02468acf13579bde,
0x2020202020202020, 0x8181818181818181,
0xffffffffffffffff, 0x8f8f8f8f8f8f8f8f,
0x0101010101010100, 0x7f7f7f7ffffffffe};
ArithPredicatedFn fn = &MacroAssembler::Add;
IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, add_exp_b);
IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, add_exp_h);
IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, add_exp_s);
IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, add_exp_d);
unsigned sub_exp_b[] = {0x00, 0x00, 0x00, 0x81, 0x7e, 0x8f, 0x02, 0x7f};
unsigned sub_exp_h[] = {0x0000, 0x0000, 0x1010, 0x8181,
0x7e7e, 0x8e8f, 0x0101, 0x7f80};
unsigned sub_exp_s[] = {0x00000000, 0x01234567, 0x00000000, 0x81818181,
0x7e7e7e7e, 0x0f0f0f0f, 0x01010102, 0x7f7f7f80};
uint64_t sub_exp_d[] = {0x0000000000000000, 0x0000000000000000,
0x0000000000000000, 0x8181818181818181,
0xffffffffffffffff, 0x8e8e8e8e8e8e8e8f,
0x0101010101010102, 0x7f7f7f8000000000};
fn = &MacroAssembler::Sub;
IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, sub_exp_b);
IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, sub_exp_h);
IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, sub_exp_s);
IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, sub_exp_d);
// clang-format on
}
TEST_SVE(sve_binary_arithmetic_predicated_umin_umax_uabd) {
// clang-format off
unsigned zn_b[] = {0x00, 0xff, 0x0f, 0xff, 0xf0, 0x98, 0x55, 0x67};
unsigned zm_b[] = {0x01, 0x00, 0x0e, 0xfe, 0xfe, 0xab, 0xcd, 0x78};
unsigned zn_h[] = {0x0000, 0xffff, 0x00ff, 0xffff,
0xff00, 0xba98, 0x5555, 0x4567};
unsigned zm_h[] = {0x0001, 0x0000, 0x00ee, 0xfffe,
0xfe00, 0xabab, 0xcdcd, 0x5678};
unsigned zn_s[] = {0x00000000, 0xffffffff, 0x0000ffff, 0xffffffff,
0xffff0000, 0xfedcba98, 0x55555555, 0x01234567};
unsigned zm_s[] = {0x00000001, 0x00000000, 0x0000eeee, 0xfffffffe,
0xfffe0000, 0xabababab, 0xcdcdcdcd, 0x12345678};
uint64_t zn_d[] = {0x0000000000000000, 0xffffffffffffffff,
0x5555555555555555, 0x0000000001234567};
uint64_t zm_d[] = {0x0000000000000001, 0x0000000000000000,
0xcdcdcdcdcdcdcdcd, 0x0000000012345678};
int pg_b[] = {1, 1, 1, 0, 1, 1, 1, 0};
int pg_h[] = {1, 1, 0, 1, 1, 1, 0, 1};
int pg_s[] = {1, 0, 1, 1, 1, 0, 1, 1};
int pg_d[] = {1, 0, 1, 1};
unsigned umax_exp_b[] = {0x01, 0xff, 0x0f, 0xff, 0xfe, 0xab, 0xcd, 0x67};
unsigned umax_exp_h[] = {0x0001, 0xffff, 0x00ff, 0xffff,
0xff00, 0xba98, 0x5555, 0x5678};
unsigned umax_exp_s[] = {0x00000001, 0xffffffff, 0x0000ffff, 0xffffffff,
0xffff0000, 0xfedcba98, 0xcdcdcdcd, 0x12345678};
uint64_t umax_exp_d[] = {0x0000000000000001, 0xffffffffffffffff,
0xcdcdcdcdcdcdcdcd, 0x0000000012345678};
ArithPredicatedFn fn = &MacroAssembler::Umax;
IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, umax_exp_b);
IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, umax_exp_h);
IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, umax_exp_s);
IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, umax_exp_d);
unsigned umin_exp_b[] = {0x00, 0x00, 0x0e, 0xff, 0xf0, 0x98, 0x55, 0x67};
unsigned umin_exp_h[] = {0x0000, 0x0000, 0x00ff, 0xfffe,
0xfe00, 0xabab, 0x5555, 0x4567};
unsigned umin_exp_s[] = {0x00000000, 0xffffffff, 0x0000eeee, 0xfffffffe,
0xfffe0000, 0xfedcba98, 0x55555555, 0x01234567};
uint64_t umin_exp_d[] = {0x0000000000000000, 0xffffffffffffffff,
0x5555555555555555, 0x0000000001234567};
fn = &MacroAssembler::Umin;
IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, umin_exp_b);
IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, umin_exp_h);
IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, umin_exp_s);
IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, umin_exp_d);
unsigned uabd_exp_b[] = {0x01, 0xff, 0x01, 0xff, 0x0e, 0x13, 0x78, 0x67};
unsigned uabd_exp_h[] = {0x0001, 0xffff, 0x00ff, 0x0001,
0x0100, 0x0eed, 0x5555, 0x1111};
unsigned uabd_exp_s[] = {0x00000001, 0xffffffff, 0x00001111, 0x00000001,
0x00010000, 0xfedcba98, 0x78787878, 0x11111111};
uint64_t uabd_exp_d[] = {0x0000000000000001, 0xffffffffffffffff,
0x7878787878787878, 0x0000000011111111};
fn = &MacroAssembler::Uabd;
IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, uabd_exp_b);
IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, uabd_exp_h);
IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, uabd_exp_s);
IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, uabd_exp_d);
// clang-format on
}
TEST_SVE(sve_binary_arithmetic_predicated_smin_smax_sabd) {
// clang-format off
int zn_b[] = {0, -128, -128, -128, -128, 127, 127, 1};
int zm_b[] = {-1, 0, -1, -127, 127, 126, -1, 0};
int zn_h[] = {0, INT16_MIN, INT16_MIN, INT16_MIN,
INT16_MIN, INT16_MAX, INT16_MAX, 1};
int zm_h[] = {-1, 0, -1, INT16_MIN + 1,
INT16_MAX, INT16_MAX - 1, -1, 0};
int zn_s[] = {0, INT32_MIN, INT32_MIN, INT32_MIN,
INT32_MIN, INT32_MAX, INT32_MAX, 1};
int zm_s[] = {-1, 0, -1, -INT32_MAX,
INT32_MAX, INT32_MAX - 1, -1, 0};
int64_t zn_d[] = {0, INT64_MIN, INT64_MIN, INT64_MIN,
INT64_MIN, INT64_MAX, INT64_MAX, 1};
int64_t zm_d[] = {-1, 0, -1, INT64_MIN + 1,
INT64_MAX, INT64_MAX - 1, -1, 0};
int pg_b[] = {1, 1, 1, 0, 1, 1, 1, 0};
int pg_h[] = {1, 1, 0, 1, 1, 1, 0, 1};
int pg_s[] = {1, 0, 1, 1, 1, 0, 1, 1};
int pg_d[] = {0, 1, 1, 1, 0, 1, 1, 1};
int smax_exp_b[] = {0, 0, -1, -128, 127, 127, 127, 1};
int smax_exp_h[] = {0, 0, INT16_MIN, INT16_MIN + 1,
INT16_MAX, INT16_MAX, INT16_MAX, 1};
int smax_exp_s[] = {0, INT32_MIN, -1, INT32_MIN + 1,
INT32_MAX, INT32_MAX, INT32_MAX, 1};
int64_t smax_exp_d[] = {0, 0, -1, INT64_MIN + 1,
INT64_MIN, INT64_MAX, INT64_MAX, 1};
ArithPredicatedFn fn = &MacroAssembler::Smax;
IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, smax_exp_b);
IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, smax_exp_h);
IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, smax_exp_s);
IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, smax_exp_d);
int smin_exp_b[] = {-1, -128, -128, -128, -128, 126, -1, 1};
int smin_exp_h[] = {-1, INT16_MIN, INT16_MIN, INT16_MIN,
INT16_MIN, INT16_MAX - 1, INT16_MAX, 0};
int smin_exp_s[] = {-1, INT32_MIN, INT32_MIN, INT32_MIN,
INT32_MIN, INT32_MAX, -1, 0};
int64_t smin_exp_d[] = {0, INT64_MIN, INT64_MIN, INT64_MIN,
INT64_MIN, INT64_MAX - 1, -1, 0};
fn = &MacroAssembler::Smin;
IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, smin_exp_b);
IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, smin_exp_h);
IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, smin_exp_s);
IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, smin_exp_d);
unsigned sabd_exp_b[] = {1, 128, 127, 128, 255, 1, 128, 1};
unsigned sabd_exp_h[] = {1, 0x8000, 0x8000, 1, 0xffff, 1, 0x7fff, 1};
unsigned sabd_exp_s[] = {1, 0x80000000, 0x7fffffff, 1,
0xffffffff, 0x7fffffff, 0x80000000, 1};
uint64_t sabd_exp_d[] = {0, 0x8000000000000000, 0x7fffffffffffffff, 1,
0x8000000000000000, 1, 0x8000000000000000, 1};
fn = &MacroAssembler::Sabd;
IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, sabd_exp_b);
IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, sabd_exp_h);
IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, sabd_exp_s);
IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, sabd_exp_d);
// clang-format on
}
TEST_SVE(sve_binary_arithmetic_predicated_mul_umulh) {
// clang-format off
unsigned zn_b[] = {0x00, 0x01, 0x20, 0x08, 0x80, 0xff, 0x55, 0xaa};
unsigned zm_b[] = {0x7f, 0xcd, 0x80, 0xff, 0x55, 0xaa, 0x00, 0x08};
unsigned zn_h[] = {0x0000, 0x0001, 0x0020, 0x0800,
0x8000, 0xff00, 0x5555, 0xaaaa};
unsigned zm_h[] = {0x007f, 0x00cd, 0x0800, 0xffff,
0x5555, 0xaaaa, 0x0001, 0x1234};
unsigned zn_s[] = {0x00000000, 0x00000001, 0x00200020, 0x08000800,
0x12345678, 0xffffffff, 0x55555555, 0xaaaaaaaa};
unsigned zm_s[] = {0x00000000, 0x00000001, 0x00200020, 0x08000800,
0x12345678, 0x22223333, 0x55556666, 0x77778888};
uint64_t zn_d[] = {0x0000000000000000, 0x5555555555555555,
0xffffffffffffffff, 0xaaaaaaaaaaaaaaaa};
uint64_t zm_d[] = {0x0000000000000000, 0x1111111133333333,
0xddddddddeeeeeeee, 0xaaaaaaaaaaaaaaaa};
int pg_b[] = {0, 1, 1, 1, 0, 1, 1, 1};
int pg_h[] = {1, 0, 1, 1, 1, 0, 1, 1};
int pg_s[] = {1, 1, 0, 1, 1, 1, 0, 1};
int pg_d[] = {1, 1, 0, 1};
unsigned mul_exp_b[] = {0x00, 0xcd, 0x00, 0xf8, 0x80, 0x56, 0x00, 0x50};
unsigned mul_exp_h[] = {0x0000, 0x0001, 0x0000, 0xf800,
0x8000, 0xff00, 0x5555, 0x9e88};
unsigned mul_exp_s[] = {0x00000000, 0x00000001, 0x00200020, 0x00400000,
0x1df4d840, 0xddddcccd, 0x55555555, 0xb05afa50};
uint64_t mul_exp_d[] = {0x0000000000000000, 0xa4fa4fa4eeeeeeef,
0xffffffffffffffff, 0x38e38e38e38e38e4};
ArithPredicatedFn fn = &MacroAssembler::Mul;
IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, mul_exp_b);
IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, mul_exp_h);
IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, mul_exp_s);
IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, mul_exp_d);
unsigned umulh_exp_b[] = {0x00, 0x00, 0x10, 0x07, 0x80, 0xa9, 0x00, 0x05};
unsigned umulh_exp_h[] = {0x0000, 0x0001, 0x0001, 0x07ff,
0x2aaa, 0xff00, 0x0000, 0x0c22};
unsigned umulh_exp_s[] = {0x00000000, 0x00000000, 0x00200020, 0x00400080,
0x014b66dc, 0x22223332, 0x55555555, 0x4fa505af};
uint64_t umulh_exp_d[] = {0x0000000000000000, 0x05b05b05bbbbbbbb,
0xffffffffffffffff, 0x71c71c71c71c71c6};
fn = &MacroAssembler::Umulh;
IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, umulh_exp_b);
IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, umulh_exp_h);
IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, umulh_exp_s);
IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, umulh_exp_d);
// clang-format on
}
TEST_SVE(sve_binary_arithmetic_predicated_smulh) {
// clang-format off
int zn_b[] = {0, 1, -1, INT8_MIN, INT8_MAX, -1, 100, -3};
int zm_b[] = {0, INT8_MIN, INT8_MIN, INT8_MAX, INT8_MAX, -1, 2, 66};
int zn_h[] = {0, 1, -1, INT16_MIN, INT16_MAX, -1, 10000, -3};
int zm_h[] = {0, INT16_MIN, INT16_MIN, INT16_MAX, INT16_MAX, -1, 2, 6666};
int zn_s[] = {0, 1, -1, INT32_MIN, INT32_MAX, -1, 100000000, -3};
int zm_s[] = {0, INT32_MIN, INT32_MIN, INT32_MAX, INT32_MAX, -1, 2, 66666666};
int64_t zn_d[] = {0, -1, INT64_MIN, INT64_MAX};
int64_t zm_d[] = {INT64_MIN, INT64_MAX, INT64_MIN, INT64_MAX};
int pg_b[] = {0, 1, 1, 1, 0, 1, 1, 1};
int pg_h[] = {1, 0, 1, 1, 1, 0, 1, 1};
int pg_s[] = {1, 1, 0, 1, 1, 1, 0, 1};
int pg_d[] = {1, 1, 0, 1};
int exp_b[] = {0, -1, 0, -64, INT8_MAX, 0, 0, -1};
int exp_h[] = {0, 1, 0, -16384, 16383, -1, 0, -1};
int exp_s[] = {0, -1, -1, -1073741824, 1073741823, 0, 100000000, -1};
int64_t exp_d[] = {0, -1, INT64_MIN, 4611686018427387903};
ArithPredicatedFn fn = &MacroAssembler::Smulh;
IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, exp_b);
IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, exp_h);
IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, exp_s);
IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, exp_d);
// clang-format on
}
TEST_SVE(sve_binary_arithmetic_predicated_logical) {
// clang-format off
unsigned zn_b[] = {0x00, 0x01, 0x20, 0x08, 0x80, 0xff, 0x55, 0xaa};
unsigned zm_b[] = {0x7f, 0xcd, 0x80, 0xff, 0x55, 0xaa, 0x00, 0x08};
unsigned zn_h[] = {0x0000, 0x0001, 0x2020, 0x0008,
0x8000, 0xffff, 0x5555, 0xaaaa};
unsigned zm_h[] = {0x7fff, 0xabcd, 0x8000, 0xffff,
0x5555, 0xaaaa, 0x0000, 0x0800};
unsigned zn_s[] = {0x00000001, 0x20200008, 0x8000ffff, 0x5555aaaa};
unsigned zm_s[] = {0x7fffabcd, 0x8000ffff, 0x5555aaaa, 0x00000800};
uint64_t zn_d[] = {0xfedcba9876543210, 0x0123456789abcdef,
0x0001200880ff55aa, 0x0022446688aaccee};
uint64_t zm_d[] = {0xffffeeeeddddcccc, 0xccccddddeeeeffff,
0x7fcd80ff55aa0008, 0x1133557799bbddff};
int pg_b[] = {0, 1, 1, 1, 0, 1, 1, 1};
int pg_h[] = {1, 0, 1, 1, 1, 0, 1, 1};
int pg_s[] = {1, 1, 1, 0};
int pg_d[] = {1, 1, 0, 1};
unsigned and_exp_b[] = {0x00, 0x01, 0x00, 0x08, 0x80, 0xaa, 0x00, 0x08};
unsigned and_exp_h[] = {0x0000, 0x0001, 0x0000, 0x0008,
0x0000, 0xffff, 0x0000, 0x0800};
unsigned and_exp_s[] = {0x00000001, 0x00000008, 0x0000aaaa, 0x5555aaaa};
uint64_t and_exp_d[] = {0xfedcaa8854540000, 0x0000454588aacdef,
0x0001200880ff55aa, 0x0022446688aaccee};
ArithPredicatedFn fn = &MacroAssembler::And;
IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, and_exp_b);
IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, and_exp_h);
IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, and_exp_s);
IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, and_exp_d);
unsigned bic_exp_b[] = {0x00, 0x00, 0x20, 0x00, 0x80, 0x55, 0x55, 0xa2};
unsigned bic_exp_h[] = {0x0000, 0x0001, 0x2020, 0x0000,
0x8000, 0xffff, 0x5555, 0xa2aa};
unsigned bic_exp_s[] = {0x00000000, 0x20200000, 0x80005555, 0x5555aaaa};
uint64_t bic_exp_d[] = {0x0000101022003210, 0x0123002201010000,
0x0001200880ff55aa, 0x0000000000000000};
fn = &MacroAssembler::Bic;
IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, bic_exp_b);
IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, bic_exp_h);
IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, bic_exp_s);
IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, bic_exp_d);
unsigned eor_exp_b[] = {0x00, 0xcc, 0xa0, 0xf7, 0x80, 0x55, 0x55, 0xa2};
unsigned eor_exp_h[] = {0x7fff, 0x0001, 0xa020, 0xfff7,
0xd555, 0xffff, 0x5555, 0xa2aa};
unsigned eor_exp_s[] = {0x7fffabcc, 0xa020fff7, 0xd5555555, 0x5555aaaa};
uint64_t eor_exp_d[] = {0x01235476ab89fedc, 0xcdef98ba67453210,
0x0001200880ff55aa, 0x1111111111111111};
fn = &MacroAssembler::Eor;
IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, eor_exp_b);
IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, eor_exp_h);
IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, eor_exp_s);
IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, eor_exp_d);
unsigned orr_exp_b[] = {0x00, 0xcd, 0xa0, 0xff, 0x80, 0xff, 0x55, 0xaa};
unsigned orr_exp_h[] = {0x7fff, 0x0001, 0xa020, 0xffff,
0xd555, 0xffff, 0x5555, 0xaaaa};
unsigned orr_exp_s[] = {0x7fffabcd, 0xa020ffff, 0xd555ffff, 0x5555aaaa};
uint64_t orr_exp_d[] = {0xfffffefeffddfedc, 0xcdefddffefefffff,
0x0001200880ff55aa, 0x1133557799bbddff};
fn = &MacroAssembler::Orr;
IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, orr_exp_b);
IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, orr_exp_h);
IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, orr_exp_s);
IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, orr_exp_d);
// clang-format on
}
TEST_SVE(sve_binary_arithmetic_predicated_sdiv) {
// clang-format off
int zn_s[] = {0, 1, -1, 2468,
INT32_MIN, INT32_MAX, INT32_MIN, INT32_MAX,
-11111111, 87654321, 0, 0};
int zm_s[] = {1, -1, 1, 1234,
-1, INT32_MIN, 1, -1,
22222222, 80000000, -1, 0};
int64_t zn_d[] = {0, 1, -1, 2468,
INT64_MIN, INT64_MAX, INT64_MIN, INT64_MAX,
-11111111, 87654321, 0, 0};
int64_t zm_d[] = {1, -1, 1, 1234,
-1, INT64_MIN, 1, -1,
22222222, 80000000, -1, 0};
int pg_s[] = {1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0};
int pg_d[] = {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1};
int exp_s[] = {0, 1, -1, 2,
INT32_MIN, 0, INT32_MIN, -INT32_MAX,
0, 1, 0, 0};
int64_t exp_d[] = {0, -1, -1, 2,
INT64_MIN, INT64_MAX, INT64_MIN, -INT64_MAX,
0, 1, 0, 0};
ArithPredicatedFn fn = &MacroAssembler::Sdiv;
IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, exp_s);
IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, exp_d);
// clang-format on
}
TEST_SVE(sve_binary_arithmetic_predicated_udiv) {
// clang-format off
unsigned zn_s[] = {0x00000000, 0x00000001, 0xffffffff, 0x80000000,
0xffffffff, 0x80000000, 0xffffffff, 0x0000f000};
unsigned zm_s[] = {0x00000001, 0xffffffff, 0x80000000, 0x00000002,
0x00000000, 0x00000001, 0x00008000, 0xf0000000};
uint64_t zn_d[] = {0x0000000000000000, 0x0000000000000001,
0xffffffffffffffff, 0x8000000000000000,
0xffffffffffffffff, 0x8000000000000000,
0xffffffffffffffff, 0xf0000000f0000000};
uint64_t zm_d[] = {0x0000000000000001, 0xffffffff00000000,
0x8000000000000000, 0x0000000000000002,
0x8888888888888888, 0x0000000000000001,
0x0000000080000000, 0x00000000f0000000};
int pg_s[] = {1, 1, 0, 1, 1, 0, 1, 1};
int pg_d[] = {1, 0, 1, 1, 1, 1, 0, 1};
unsigned exp_s[] = {0x00000000, 0x00000000, 0xffffffff, 0x40000000,
0x00000000, 0x80000000, 0x0001ffff, 0x00000000};
uint64_t exp_d[] = {0x0000000000000000, 0x0000000000000001,
0x0000000000000001, 0x4000000000000000,
0x0000000000000001, 0x8000000000000000,
0xffffffffffffffff, 0x0000000100000001};
ArithPredicatedFn fn = &MacroAssembler::Udiv;
IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, exp_s);
IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, exp_d);
// clang-format on
}
typedef void (MacroAssembler::*ArithFn)(const ZRegister& zd,
const ZRegister& zn,
const ZRegister& zm);
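// Test an unpredicated binary arithmetic macro against per-lane expected
// values.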
template <typename T>
static void IntArithHelper(Test* config,
ArithFn macro,
unsigned lane_size_in_bits,
const T& zn_inputs,
const T& zm_inputs,
const T& zd_expected) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
ZRegister zn = z31.WithLaneSize(lane_size_in_bits);
ZRegister zm = z27.WithLaneSize(lane_size_in_bits);
InsrHelper(&masm, zn, zn_inputs);
InsrHelper(&masm, zm, zm_inputs);
ZRegister zd = z0.WithLaneSize(lane_size_in_bits);
(masm.*macro)(zd, zn, zm);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(zd_expected, zd);
}
}
TEST_SVE(sve_arithmetic_unpredicated_add_sqadd_uqadd) {
// clang-format off
unsigned in_b[] = {0x81, 0x7f, 0x10, 0xaa, 0x55, 0xff, 0xf0};
unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa, 0x5555, 0xffff, 0xf0f0};
unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0x10001010, 0xaaaaaaaa, 0xf000f0f0};
uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f,
0x1000000010001010, 0xf0000000f000f0f0};
ArithFn fn = &MacroAssembler::Add;
unsigned add_exp_b[] = {0x02, 0xfe, 0x20, 0x54, 0xaa, 0xfe, 0xe0};
unsigned add_exp_h[] = {0x0302, 0xfefe, 0x2020, 0x5554, 0xaaaa, 0xfffe, 0xe1e0};
unsigned add_exp_s[] = {0x00030302, 0xfffefefe, 0x20002020, 0x55555554, 0xe001e1e0};
uint64_t add_exp_d[] = {0x0000000300030302, 0xfffffffefffefefe,
0x2000000020002020, 0xe0000001e001e1e0};
IntArithHelper(config, fn, kBRegSize, in_b, in_b, add_exp_b);
IntArithHelper(config, fn, kHRegSize, in_h, in_h, add_exp_h);
IntArithHelper(config, fn, kSRegSize, in_s, in_s, add_exp_s);
IntArithHelper(config, fn, kDRegSize, in_d, in_d, add_exp_d);
fn = &MacroAssembler::Sqadd;
unsigned sqadd_exp_b[] = {0x80, 0x7f, 0x20, 0x80, 0x7f, 0xfe, 0xe0};
unsigned sqadd_exp_h[] = {0x8000, 0x7fff, 0x2020, 0x8000, 0x7fff, 0xfffe, 0xe1e0};
unsigned sqadd_exp_s[] = {0x80000000, 0x7fffffff, 0x20002020, 0x80000000, 0xe001e1e0};
uint64_t sqadd_exp_d[] = {0x8000000000000000, 0x7fffffffffffffff,
0x2000000020002020, 0xe0000001e001e1e0};
IntArithHelper(config, fn, kBRegSize, in_b, in_b, sqadd_exp_b);
IntArithHelper(config, fn, kHRegSize, in_h, in_h, sqadd_exp_h);
IntArithHelper(config, fn, kSRegSize, in_s, in_s, sqadd_exp_s);
IntArithHelper(config, fn, kDRegSize, in_d, in_d, sqadd_exp_d);
fn = &MacroAssembler::Uqadd;
unsigned uqadd_exp_b[] = {0xff, 0xfe, 0x20, 0xff, 0xaa, 0xff, 0xff};
unsigned uqadd_exp_h[] = {0xffff, 0xfefe, 0x2020, 0xffff, 0xaaaa, 0xffff, 0xffff};
unsigned uqadd_exp_s[] = {0xffffffff, 0xfffefefe, 0x20002020, 0xffffffff, 0xffffffff};
uint64_t uqadd_exp_d[] = {0xffffffffffffffff, 0xfffffffefffefefe,
0x2000000020002020, 0xffffffffffffffff};
IntArithHelper(config, fn, kBRegSize, in_b, in_b, uqadd_exp_b);
IntArithHelper(config, fn, kHRegSize, in_h, in_h, uqadd_exp_h);
IntArithHelper(config, fn, kSRegSize, in_s, in_s, uqadd_exp_s);
IntArithHelper(config, fn, kDRegSize, in_d, in_d, uqadd_exp_d);
// clang-format on
}
TEST_SVE(sve_arithmetic_unpredicated_sub_sqsub_uqsub) {
// clang-format off
unsigned ins1_b[] = {0x81, 0x7f, 0x7e, 0xaa};
unsigned ins2_b[] = {0x10, 0xf0, 0xf0, 0x55};
unsigned ins1_h[] = {0x8181, 0x7f7f, 0x7e7e, 0xaaaa};
unsigned ins2_h[] = {0x1010, 0xf0f0, 0xf0f0, 0x5555};
unsigned ins1_s[] = {0x80018181, 0x7fff7f7f, 0x7eee7e7e, 0xaaaaaaaa};
unsigned ins2_s[] = {0x10001010, 0xf000f0f0, 0xf000f0f0, 0x55555555};
uint64_t ins1_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f,
0x7eeeeeee7eee7e7e, 0xaaaaaaaaaaaaaaaa};
uint64_t ins2_d[] = {0x1000000010001010, 0xf0000000f000f0f0,
0xf0000000f000f0f0, 0x5555555555555555};
ArithFn fn = &MacroAssembler::Sub;
unsigned ins1_sub_ins2_exp_b[] = {0x71, 0x8f, 0x8e, 0x55};
unsigned ins1_sub_ins2_exp_h[] = {0x7171, 0x8e8f, 0x8d8e, 0x5555};
unsigned ins1_sub_ins2_exp_s[] = {0x70017171, 0x8ffe8e8f, 0x8eed8d8e, 0x55555555};
uint64_t ins1_sub_ins2_exp_d[] = {0x7000000170017171, 0x8ffffffe8ffe8e8f,
0x8eeeeeed8eed8d8e, 0x5555555555555555};
IntArithHelper(config, fn, kBRegSize, ins1_b, ins2_b, ins1_sub_ins2_exp_b);
IntArithHelper(config, fn, kHRegSize, ins1_h, ins2_h, ins1_sub_ins2_exp_h);
IntArithHelper(config, fn, kSRegSize, ins1_s, ins2_s, ins1_sub_ins2_exp_s);
IntArithHelper(config, fn, kDRegSize, ins1_d, ins2_d, ins1_sub_ins2_exp_d);
unsigned ins2_sub_ins1_exp_b[] = {0x8f, 0x71, 0x72, 0xab};
unsigned ins2_sub_ins1_exp_h[] = {0x8e8f, 0x7171, 0x7272, 0xaaab};
unsigned ins2_sub_ins1_exp_s[] = {0x8ffe8e8f, 0x70017171, 0x71127272, 0xaaaaaaab};
uint64_t ins2_sub_ins1_exp_d[] = {0x8ffffffe8ffe8e8f, 0x7000000170017171,
0x7111111271127272, 0xaaaaaaaaaaaaaaab};
IntArithHelper(config, fn, kBRegSize, ins2_b, ins1_b, ins2_sub_ins1_exp_b);
IntArithHelper(config, fn, kHRegSize, ins2_h, ins1_h, ins2_sub_ins1_exp_h);
IntArithHelper(config, fn, kSRegSize, ins2_s, ins1_s, ins2_sub_ins1_exp_s);
IntArithHelper(config, fn, kDRegSize, ins2_d, ins1_d, ins2_sub_ins1_exp_d);
fn = &MacroAssembler::Sqsub;
unsigned ins1_sqsub_ins2_exp_b[] = {0x80, 0x7f, 0x7f, 0x80};
unsigned ins1_sqsub_ins2_exp_h[] = {0x8000, 0x7fff, 0x7fff, 0x8000};
unsigned ins1_sqsub_ins2_exp_s[] = {0x80000000, 0x7fffffff, 0x7fffffff, 0x80000000};
uint64_t ins1_sqsub_ins2_exp_d[] = {0x8000000000000000, 0x7fffffffffffffff,
0x7fffffffffffffff, 0x8000000000000000};
IntArithHelper(config, fn, kBRegSize, ins1_b, ins2_b, ins1_sqsub_ins2_exp_b);
IntArithHelper(config, fn, kHRegSize, ins1_h, ins2_h, ins1_sqsub_ins2_exp_h);
IntArithHelper(config, fn, kSRegSize, ins1_s, ins2_s, ins1_sqsub_ins2_exp_s);
IntArithHelper(config, fn, kDRegSize, ins1_d, ins2_d, ins1_sqsub_ins2_exp_d);
unsigned ins2_sqsub_ins1_exp_b[] = {0x7f, 0x80, 0x80, 0x7f};
unsigned ins2_sqsub_ins1_exp_h[] = {0x7fff, 0x8000, 0x8000, 0x7fff};
unsigned ins2_sqsub_ins1_exp_s[] = {0x7fffffff, 0x80000000, 0x80000000, 0x7fffffff};
uint64_t ins2_sqsub_ins1_exp_d[] = {0x7fffffffffffffff, 0x8000000000000000,
0x8000000000000000, 0x7fffffffffffffff};
IntArithHelper(config, fn, kBRegSize, ins2_b, ins1_b, ins2_sqsub_ins1_exp_b);
IntArithHelper(config, fn, kHRegSize, ins2_h, ins1_h, ins2_sqsub_ins1_exp_h);
IntArithHelper(config, fn, kSRegSize, ins2_s, ins1_s, ins2_sqsub_ins1_exp_s);
IntArithHelper(config, fn, kDRegSize, ins2_d, ins1_d, ins2_sqsub_ins1_exp_d);
fn = &MacroAssembler::Uqsub;
unsigned ins1_uqsub_ins2_exp_b[] = {0x71, 0x00, 0x00, 0x55};
unsigned ins1_uqsub_ins2_exp_h[] = {0x7171, 0x0000, 0x0000, 0x5555};
unsigned ins1_uqsub_ins2_exp_s[] = {0x70017171, 0x00000000, 0x00000000, 0x55555555};
uint64_t ins1_uqsub_ins2_exp_d[] = {0x7000000170017171, 0x0000000000000000,
0x0000000000000000, 0x5555555555555555};
IntArithHelper(config, fn, kBRegSize, ins1_b, ins2_b, ins1_uqsub_ins2_exp_b);
IntArithHelper(config, fn, kHRegSize, ins1_h, ins2_h, ins1_uqsub_ins2_exp_h);
IntArithHelper(config, fn, kSRegSize, ins1_s, ins2_s, ins1_uqsub_ins2_exp_s);
IntArithHelper(config, fn, kDRegSize, ins1_d, ins2_d, ins1_uqsub_ins2_exp_d);
unsigned ins2_uqsub_ins1_exp_b[] = {0x00, 0x71, 0x72, 0x00};
unsigned ins2_uqsub_ins1_exp_h[] = {0x0000, 0x7171, 0x7272, 0x0000};
unsigned ins2_uqsub_ins1_exp_s[] = {0x00000000, 0x70017171, 0x71127272, 0x00000000};
uint64_t ins2_uqsub_ins1_exp_d[] = {0x0000000000000000, 0x7000000170017171,
0x7111111271127272, 0x0000000000000000};
IntArithHelper(config, fn, kBRegSize, ins2_b, ins1_b, ins2_uqsub_ins1_exp_b);
IntArithHelper(config, fn, kHRegSize, ins2_h, ins1_h, ins2_uqsub_ins1_exp_h);
IntArithHelper(config, fn, kSRegSize, ins2_s, ins1_s, ins2_uqsub_ins1_exp_s);
IntArithHelper(config, fn, kDRegSize, ins2_d, ins1_d, ins2_uqsub_ins1_exp_d);
// clang-format on
}
TEST_SVE(sve_rdvl) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// Encodable multipliers.
__ Rdvl(x0, 0);
__ Rdvl(x1, 1);
__ Rdvl(x2, 2);
__ Rdvl(x3, 31);
__ Rdvl(x4, -1);
__ Rdvl(x5, -2);
__ Rdvl(x6, -32);
// For unencodable multipliers, the MacroAssembler uses a sequence of
// instructions.
__ Rdvl(x10, 32);
__ Rdvl(x11, -33);
__ Rdvl(x12, 42);
__ Rdvl(x13, -42);
// The maximum value of VL is 256 (bytes), so the multiplier is limited to the
// range [INT64_MIN/256, INT64_MAX/256], to ensure that no signed overflow
// occurs in the macro.
__ Rdvl(x14, 0x007fffffffffffff);
__ Rdvl(x15, -0x0080000000000000);
END();
if (CAN_RUN()) {
RUN();
uint64_t vl = config->sve_vl_in_bytes();
ASSERT_EQUAL_64(vl * 0, x0);
ASSERT_EQUAL_64(vl * 1, x1);
ASSERT_EQUAL_64(vl * 2, x2);
ASSERT_EQUAL_64(vl * 31, x3);
ASSERT_EQUAL_64(vl * -1, x4);
ASSERT_EQUAL_64(vl * -2, x5);
ASSERT_EQUAL_64(vl * -32, x6);
ASSERT_EQUAL_64(vl * 32, x10);
ASSERT_EQUAL_64(vl * -33, x11);
ASSERT_EQUAL_64(vl * 42, x12);
ASSERT_EQUAL_64(vl * -42, x13);
ASSERT_EQUAL_64(vl * 0x007fffffffffffff, x14);
ASSERT_EQUAL_64(vl * 0xff80000000000000, x15);
}
}
TEST_SVE(sve_rdpl) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// There is no `rdpl` instruction, so the MacroAssembler maps `Rdpl` onto
// Addpl(xd, xzr, ...).
// Encodable multipliers (as `addvl`).
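// (A multiple of eight PLs is a whole number of VLs, so 8, 248, -8 and
// -256 PLs are 1, 31, -1 and -32 VLs respectively, all within the range
// that `addvl` can encode.)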
__ Rdpl(x0, 0);
__ Rdpl(x1, 8);
__ Rdpl(x2, 248);
__ Rdpl(x3, -8);
__ Rdpl(x4, -256);
// Encodable multipliers (as `movz` + `addpl`).
__ Rdpl(x7, 31);
__ Rdpl(x8, -31);
// For unencodable multipliers, the MacroAssembler uses a sequence of
// instructions.
__ Rdpl(x10, 42);
__ Rdpl(x11, -42);
// The maximum value of VL is 256 (bytes), so the multiplier is limited to the
// range [INT64_MIN/256, INT64_MAX/256], to ensure that no signed overflow
// occurs in the macro.
__ Rdpl(x12, 0x007fffffffffffff);
__ Rdpl(x13, -0x0080000000000000);
END();
if (CAN_RUN()) {
RUN();
uint64_t vl = config->sve_vl_in_bytes();
VIXL_ASSERT((vl % kZRegBitsPerPRegBit) == 0);
uint64_t pl = vl / kZRegBitsPerPRegBit;
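// A P register has one bit for each byte of a Z register, so
// kZRegBitsPerPRegBit is eight and PL is always VL / 8.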
ASSERT_EQUAL_64(pl * 0, x0);
ASSERT_EQUAL_64(pl * 8, x1);
ASSERT_EQUAL_64(pl * 248, x2);
ASSERT_EQUAL_64(pl * -8, x3);
ASSERT_EQUAL_64(pl * -256, x4);
ASSERT_EQUAL_64(pl * 31, x7);
ASSERT_EQUAL_64(pl * -31, x8);
ASSERT_EQUAL_64(pl * 42, x10);
ASSERT_EQUAL_64(pl * -42, x11);
ASSERT_EQUAL_64(pl * 0x007fffffffffffff, x12);
ASSERT_EQUAL_64(pl * 0xff80000000000000, x13);
}
}
TEST_SVE(sve_addvl) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t base = 0x1234567800000000;
__ Mov(x30, base);
// Encodable multipliers.
__ Addvl(x0, x30, 0);
__ Addvl(x1, x30, 1);
__ Addvl(x2, x30, 31);
__ Addvl(x3, x30, -1);
__ Addvl(x4, x30, -32);
// For unencodable multipliers, the MacroAssembler uses `Rdvl` and `Add`.
__ Addvl(x5, x30, 32);
__ Addvl(x6, x30, -33);
// Test the limits of the multiplier supported by the `Rdvl` macro.
__ Addvl(x7, x30, 0x007fffffffffffff);
__ Addvl(x8, x30, -0x0080000000000000);
// Check that xzr behaves correctly.
__ Addvl(x9, xzr, 8);
__ Addvl(x10, xzr, 42);
// Check that sp behaves correctly with encodable and unencodable multipliers.
__ Addvl(sp, sp, -5);
__ Addvl(sp, sp, -37);
__ Addvl(x11, sp, -2);
__ Addvl(sp, x11, 2);
__ Addvl(x12, sp, -42);
// Restore the value of sp.
__ Addvl(sp, x11, 39);
__ Addvl(sp, sp, 5);
// Adjust x11 and x12 to make the test sp-agnostic.
__ Sub(x11, sp, x11);
__ Sub(x12, sp, x12);
// Check cases where xd.Is(xn). This stresses scratch register allocation.
__ Mov(x20, x30);
__ Mov(x21, x30);
__ Mov(x22, x30);
__ Addvl(x20, x20, 4);
__ Addvl(x21, x21, 42);
__ Addvl(x22, x22, -0x0080000000000000);
END();
if (CAN_RUN()) {
RUN();
uint64_t vl = config->sve_vl_in_bytes();
ASSERT_EQUAL_64(base + (vl * 0), x0);
ASSERT_EQUAL_64(base + (vl * 1), x1);
ASSERT_EQUAL_64(base + (vl * 31), x2);
ASSERT_EQUAL_64(base + (vl * -1), x3);
ASSERT_EQUAL_64(base + (vl * -32), x4);
ASSERT_EQUAL_64(base + (vl * 32), x5);
ASSERT_EQUAL_64(base + (vl * -33), x6);
ASSERT_EQUAL_64(base + (vl * 0x007fffffffffffff), x7);
ASSERT_EQUAL_64(base + (vl * 0xff80000000000000), x8);
ASSERT_EQUAL_64(vl * 8, x9);
ASSERT_EQUAL_64(vl * 42, x10);
ASSERT_EQUAL_64(vl * 44, x11);
ASSERT_EQUAL_64(vl * 84, x12);
ASSERT_EQUAL_64(base + (vl * 4), x20);
ASSERT_EQUAL_64(base + (vl * 42), x21);
ASSERT_EQUAL_64(base + (vl * 0xff80000000000000), x22);
ASSERT_EQUAL_64(base, x30);
}
}
TEST_SVE(sve_addpl) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t base = 0x1234567800000000;
__ Mov(x30, base);
// Encodable multipliers.
__ Addpl(x0, x30, 0);
__ Addpl(x1, x30, 1);
__ Addpl(x2, x30, 31);
__ Addpl(x3, x30, -1);
__ Addpl(x4, x30, -32);
// For unencodable multipliers, the MacroAssembler uses `Addvl` if it can, or
// it falls back to `Rdvl` and `Add`.
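// (32 PLs is exactly 4 VLs, so the first case can use `addvl`; -33 PLs is
// not a whole number of VLs, so it needs the full fallback sequence.)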
__ Addpl(x5, x30, 32);
__ Addpl(x6, x30, -33);
// Test the limits of the multiplier supported by the `Rdvl` macro.
__ Addpl(x7, x30, 0x007fffffffffffff);
__ Addpl(x8, x30, -0x0080000000000000);
// Check that xzr behaves correctly.
__ Addpl(x9, xzr, 8);
__ Addpl(x10, xzr, 42);
// Check that sp behaves correctly with encodable and unencodable multipliers.
__ Addpl(sp, sp, -5);
__ Addpl(sp, sp, -37);
__ Addpl(x11, sp, -2);
__ Addpl(sp, x11, 2);
__ Addpl(x12, sp, -42);
// Restore the value of sp.
__ Addpl(sp, x11, 39);
__ Addpl(sp, sp, 5);
// Adjust x11 and x12 to make the test sp-agnostic.
__ Sub(x11, sp, x11);
__ Sub(x12, sp, x12);
// Check cases where xd.Is(xn). This stresses scratch register allocation.
__ Mov(x20, x30);
__ Mov(x21, x30);
__ Mov(x22, x30);
__ Addpl(x20, x20, 4);
__ Addpl(x21, x21, 42);
__ Addpl(x22, x22, -0x0080000000000000);
END();
if (CAN_RUN()) {
RUN();
uint64_t vl = config->sve_vl_in_bytes();
VIXL_ASSERT((vl % kZRegBitsPerPRegBit) == 0);
uint64_t pl = vl / kZRegBitsPerPRegBit;
ASSERT_EQUAL_64(base + (pl * 0), x0);
ASSERT_EQUAL_64(base + (pl * 1), x1);
ASSERT_EQUAL_64(base + (pl * 31), x2);
ASSERT_EQUAL_64(base + (pl * -1), x3);
ASSERT_EQUAL_64(base + (pl * -32), x4);
ASSERT_EQUAL_64(base + (pl * 32), x5);
ASSERT_EQUAL_64(base + (pl * -33), x6);
ASSERT_EQUAL_64(base + (pl * 0x007fffffffffffff), x7);
ASSERT_EQUAL_64(base + (pl * 0xff80000000000000), x8);
ASSERT_EQUAL_64(pl * 8, x9);
ASSERT_EQUAL_64(pl * 42, x10);
ASSERT_EQUAL_64(pl * 44, x11);
ASSERT_EQUAL_64(pl * 84, x12);
ASSERT_EQUAL_64(base + (pl * 4), x20);
ASSERT_EQUAL_64(base + (pl * 42), x21);
ASSERT_EQUAL_64(base + (pl * 0xff80000000000000), x22);
ASSERT_EQUAL_64(base, x30);
}
}
TEST_SVE(sve_calculate_sve_address) {
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wshadow"
// Shadow the `MacroAssembler` type so that the test macros work without
// modification.
typedef CalculateSVEAddressMacroAssembler MacroAssembler;
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START(); // NOLINT(clang-diagnostic-local-type-template-args)
uint64_t base = 0x1234567800000000;
__ Mov(x28, base);
__ Mov(x29, 48);
__ Mov(x30, -48);
// Simple scalar (or equivalent) cases.
__ CalculateSVEAddress(x0, SVEMemOperand(x28));
__ CalculateSVEAddress(x1, SVEMemOperand(x28, 0));
__ CalculateSVEAddress(x2, SVEMemOperand(x28, 0, SVE_MUL_VL));
__ CalculateSVEAddress(x3, SVEMemOperand(x28, 0, SVE_MUL_VL), 3);
__ CalculateSVEAddress(x4, SVEMemOperand(x28, xzr));
__ CalculateSVEAddress(x5, SVEMemOperand(x28, xzr, LSL, 42));
// scalar-plus-immediate
// Unscaled immediates, handled with `Add`.
__ CalculateSVEAddress(x6, SVEMemOperand(x28, 42));
__ CalculateSVEAddress(x7, SVEMemOperand(x28, -42));
// Scaled immediates, handled with `Addvl` or `Addpl`.
__ CalculateSVEAddress(x8, SVEMemOperand(x28, 31, SVE_MUL_VL), 0);
__ CalculateSVEAddress(x9, SVEMemOperand(x28, -32, SVE_MUL_VL), 0);
// Out of `addvl` or `addpl` range.
__ CalculateSVEAddress(x10, SVEMemOperand(x28, 42, SVE_MUL_VL), 0);
__ CalculateSVEAddress(x11, SVEMemOperand(x28, -42, SVE_MUL_VL), 0);
// As above, for VL-based accesses smaller than a Z register.
VIXL_STATIC_ASSERT(kZRegBitsPerPRegBitLog2 == 3);
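// With a shift of three, the `SVE_MUL_VL` offsets here are interpreted in
// units of VL / 8 (one PL), so an offset of -32 * 8 such units is -32 VLs.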
__ CalculateSVEAddress(x12, SVEMemOperand(x28, -32 * 8, SVE_MUL_VL), 3);
__ CalculateSVEAddress(x13, SVEMemOperand(x28, -42 * 8, SVE_MUL_VL), 3);
__ CalculateSVEAddress(x14, SVEMemOperand(x28, -32 * 4, SVE_MUL_VL), 2);
__ CalculateSVEAddress(x15, SVEMemOperand(x28, -42 * 4, SVE_MUL_VL), 2);
__ CalculateSVEAddress(x18, SVEMemOperand(x28, -32 * 2, SVE_MUL_VL), 1);
__ CalculateSVEAddress(x19, SVEMemOperand(x28, -42 * 2, SVE_MUL_VL), 1);
// scalar-plus-scalar
__ CalculateSVEAddress(x20, SVEMemOperand(x28, x29));
__ CalculateSVEAddress(x21, SVEMemOperand(x28, x30));
__ CalculateSVEAddress(x22, SVEMemOperand(x28, x29, LSL, 8));
__ CalculateSVEAddress(x23, SVEMemOperand(x28, x30, LSL, 8));
// In-place updates, to stress scratch register allocation.
__ Mov(x24, 0xabcd000000000000);
__ Mov(x25, 0xabcd101100000000);
__ Mov(x26, 0xabcd202200000000);
__ Mov(x27, 0xabcd303300000000);
__ Mov(x28, 0xabcd404400000000);
__ Mov(x29, 0xabcd505500000000);
__ CalculateSVEAddress(x24, SVEMemOperand(x24));
__ CalculateSVEAddress(x25, SVEMemOperand(x25, 0x42));
__ CalculateSVEAddress(x26, SVEMemOperand(x26, 3, SVE_MUL_VL), 0);
__ CalculateSVEAddress(x27, SVEMemOperand(x27, 0x42, SVE_MUL_VL), 3);
__ CalculateSVEAddress(x28, SVEMemOperand(x28, x30));
__ CalculateSVEAddress(x29, SVEMemOperand(x29, x30, LSL, 4));
END();
if (CAN_RUN()) {
RUN();
uint64_t vl = config->sve_vl_in_bytes();
VIXL_ASSERT((vl % kZRegBitsPerPRegBit) == 0);
uint64_t pl = vl / kZRegBitsPerPRegBit;
// Simple scalar (or equivalent) cases.
ASSERT_EQUAL_64(base, x0);
ASSERT_EQUAL_64(base, x1);
ASSERT_EQUAL_64(base, x2);
ASSERT_EQUAL_64(base, x3);
ASSERT_EQUAL_64(base, x4);
ASSERT_EQUAL_64(base, x5);
// scalar-plus-immediate
ASSERT_EQUAL_64(base + 42, x6);
ASSERT_EQUAL_64(base - 42, x7);
ASSERT_EQUAL_64(base + (31 * vl), x8);
ASSERT_EQUAL_64(base - (32 * vl), x9);
ASSERT_EQUAL_64(base + (42 * vl), x10);
ASSERT_EQUAL_64(base - (42 * vl), x11);
ASSERT_EQUAL_64(base - (32 * vl), x12);
ASSERT_EQUAL_64(base - (42 * vl), x13);
ASSERT_EQUAL_64(base - (32 * vl), x14);
ASSERT_EQUAL_64(base - (42 * vl), x15);
ASSERT_EQUAL_64(base - (32 * vl), x18);
ASSERT_EQUAL_64(base - (42 * vl), x19);
// scalar-plus-scalar
ASSERT_EQUAL_64(base + 48, x20);
ASSERT_EQUAL_64(base - 48, x21);
ASSERT_EQUAL_64(base + (48 << 8), x22);
ASSERT_EQUAL_64(base - (48 << 8), x23);
// In-place updates.
ASSERT_EQUAL_64(0xabcd000000000000, x24);
ASSERT_EQUAL_64(0xabcd101100000000 + 0x42, x25);
ASSERT_EQUAL_64(0xabcd202200000000 + (3 * vl), x26);
ASSERT_EQUAL_64(0xabcd303300000000 + (0x42 * pl), x27);
ASSERT_EQUAL_64(0xabcd404400000000 - 48, x28);
ASSERT_EQUAL_64(0xabcd505500000000 - (48 << 4), x29);
}
#pragma GCC diagnostic pop
}
TEST_SVE(sve_permute_vector_unpredicated) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
START();
// Initialise registers with known values first.
__ Dup(z1.VnB(), 0x11);
__ Dup(z2.VnB(), 0x22);
__ Dup(z3.VnB(), 0x33);
__ Dup(z4.VnB(), 0x44);
__ Mov(x0, 0x0123456789abcdef);
__ Fmov(d0, RawbitsToDouble(0x7ffaaaaa22223456));
__ Insr(z1.VnS(), w0);
__ Insr(z2.VnD(), x0);
__ Insr(z3.VnH(), h0);
__ Insr(z4.VnD(), d0);
uint64_t inputs[] = {0xfedcba9876543210,
0x0123456789abcdef,
0x8f8e8d8c8b8a8988,
0x8786858483828180};
// Initialise a distinguishable value throughout the register first.
__ Dup(z9.VnB(), 0xff);
InsrHelper(&masm, z9.VnD(), inputs);
__ Rev(z5.VnB(), z9.VnB());
__ Rev(z6.VnH(), z9.VnH());
__ Rev(z7.VnS(), z9.VnS());
__ Rev(z8.VnD(), z9.VnD());
int index[7] = {22, 7, 7, 3, 1, 1, 63};
// Broadcasting an element from within the input array.
__ Dup(z10.VnB(), z9.VnB(), index[0]);
__ Dup(z11.VnH(), z9.VnH(), index[1]);
__ Dup(z12.VnS(), z9.VnS(), index[2]);
__ Dup(z13.VnD(), z9.VnD(), index[3]);
__ Dup(z14.VnQ(), z9.VnQ(), index[4]);
// Test dst == src
__ Mov(z15, z9);
__ Dup(z15.VnS(), z15.VnS(), index[5]);
// Selecting an element beyond the input array.
__ Dup(z16.VnB(), z9.VnB(), index[6]);
END();
if (CAN_RUN()) {
RUN();
// Insr
uint64_t z1_expected[] = {0x1111111111111111, 0x1111111189abcdef};
uint64_t z2_expected[] = {0x2222222222222222, 0x0123456789abcdef};
uint64_t z3_expected[] = {0x3333333333333333, 0x3333333333333456};
uint64_t z4_expected[] = {0x4444444444444444, 0x7ffaaaaa22223456};
ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
// Rev
int lane_count = core.GetSVELaneCount(kBRegSize);
for (int i = 0; i < lane_count; i++) {
uint64_t expected =
core.zreg_lane(z5.GetCode(), kBRegSize, lane_count - i - 1);
uint64_t input = core.zreg_lane(z9.GetCode(), kBRegSize, i);
ASSERT_EQUAL_64(expected, input);
}
lane_count = core.GetSVELaneCount(kHRegSize);
for (int i = 0; i < lane_count; i++) {
uint64_t expected =
core.zreg_lane(z6.GetCode(), kHRegSize, lane_count - i - 1);
uint64_t input = core.zreg_lane(z9.GetCode(), kHRegSize, i);
ASSERT_EQUAL_64(expected, input);
}
lane_count = core.GetSVELaneCount(kSRegSize);
for (int i = 0; i < lane_count; i++) {
uint64_t expected =
core.zreg_lane(z7.GetCode(), kSRegSize, lane_count - i - 1);
uint64_t input = core.zreg_lane(z9.GetCode(), kSRegSize, i);
ASSERT_EQUAL_64(expected, input);
}
lane_count = core.GetSVELaneCount(kDRegSize);
for (int i = 0; i < lane_count; i++) {
uint64_t expected =
core.zreg_lane(z8.GetCode(), kDRegSize, lane_count - i - 1);
uint64_t input = core.zreg_lane(z9.GetCode(), kDRegSize, i);
ASSERT_EQUAL_64(expected, input);
}
// Dup
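// An indexed `dup` broadcasts the selected element to every lane, but
// zeroes the destination if the index is beyond the current vector
// length; hence each expected value falls back to zero.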
unsigned vl = config->sve_vl_in_bits();
lane_count = core.GetSVELaneCount(kBRegSize);
uint64_t expected_z10 = (vl > (index[0] * kBRegSize)) ? 0x23 : 0;
for (int i = 0; i < lane_count; i++) {
ASSERT_EQUAL_SVE_LANE(expected_z10, z10.VnB(), i);
}
lane_count = core.GetSVELaneCount(kHRegSize);
uint64_t expected_z11 = (vl > (index[1] * kHRegSize)) ? 0x8f8e : 0;
for (int i = 0; i < lane_count; i++) {
ASSERT_EQUAL_SVE_LANE(expected_z11, z11.VnH(), i);
}
lane_count = core.GetSVELaneCount(kSRegSize);
uint64_t expected_z12 = (vl > (index[2] * kSRegSize)) ? 0xfedcba98 : 0;
for (int i = 0; i < lane_count; i++) {
ASSERT_EQUAL_SVE_LANE(expected_z12, z12.VnS(), i);
}
lane_count = core.GetSVELaneCount(kDRegSize);
uint64_t expected_z13 =
(vl > (index[3] * kDRegSize)) ? 0xfedcba9876543210 : 0;
for (int i = 0; i < lane_count; i++) {
ASSERT_EQUAL_SVE_LANE(expected_z13, z13.VnD(), i);
}
lane_count = core.GetSVELaneCount(kDRegSize);
uint64_t expected_z14_lo = 0;
uint64_t expected_z14_hi = 0;
if (vl > (index[4] * kQRegSize)) {
expected_z14_lo = 0x0123456789abcdef;
expected_z14_hi = 0xfedcba9876543210;
}
for (int i = 0; i < lane_count; i += 2) {
ASSERT_EQUAL_SVE_LANE(expected_z14_lo, z14.VnD(), i);
ASSERT_EQUAL_SVE_LANE(expected_z14_hi, z14.VnD(), i + 1);
}
lane_count = core.GetSVELaneCount(kSRegSize);
uint64_t expected_z15 = (vl > (index[5] * kSRegSize)) ? 0x87868584 : 0;
for (int i = 0; i < lane_count; i++) {
ASSERT_EQUAL_SVE_LANE(expected_z15, z15.VnS(), i);
}
lane_count = core.GetSVELaneCount(kBRegSize);
uint64_t expected_z16 = (vl > (index[6] * kBRegSize)) ? 0xff : 0;
for (int i = 0; i < lane_count; i++) {
ASSERT_EQUAL_SVE_LANE(expected_z16, z16.VnB(), i);
}
}
}
TEST_SVE(sve_permute_vector_unpredicated_unpack_vector_elements) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t z9_inputs[] = {0xfedcba9876543210,
0x0123456789abcdef,
0x8f8e8d8c8b8a8988,
0x8786858483828180};
InsrHelper(&masm, z9.VnD(), z9_inputs);
__ Sunpkhi(z10.VnH(), z9.VnB());
__ Sunpkhi(z11.VnS(), z9.VnH());
__ Sunpkhi(z12.VnD(), z9.VnS());
__ Sunpklo(z13.VnH(), z9.VnB());
__ Sunpklo(z14.VnS(), z9.VnH());
__ Sunpklo(z15.VnD(), z9.VnS());
__ Uunpkhi(z16.VnH(), z9.VnB());
__ Uunpkhi(z17.VnS(), z9.VnH());
__ Uunpkhi(z18.VnD(), z9.VnS());
__ Uunpklo(z19.VnH(), z9.VnB());
__ Uunpklo(z20.VnS(), z9.VnH());
__ Uunpklo(z21.VnD(), z9.VnS());
// Test unpacking with same source and destination.
__ Mov(z22, z9);
__ Sunpklo(z22.VnH(), z22.VnB());
__ Mov(z23, z9);
__ Uunpklo(z23.VnH(), z23.VnB());
END();
if (CAN_RUN()) {
RUN();
// Sunpkhi
int lane_count = core.GetSVELaneCount(kHRegSize);
for (int i = lane_count - 1; i >= 0; i--) {
uint16_t expected = core.zreg_lane<uint16_t>(z10.GetCode(), i);
uint8_t b_lane = core.zreg_lane<uint8_t>(z9.GetCode(), i + lane_count);
uint16_t input = SignExtend<int16_t>(b_lane, kBRegSize);
ASSERT_EQUAL_64(expected, input);
}
lane_count = core.GetSVELaneCount(kSRegSize);
for (int i = lane_count - 1; i >= 0; i--) {
uint32_t expected = core.zreg_lane<uint32_t>(z11.GetCode(), i);
uint16_t h_lane = core.zreg_lane<uint16_t>(z9.GetCode(), i + lane_count);
uint32_t input = SignExtend<int32_t>(h_lane, kHRegSize);
ASSERT_EQUAL_64(expected, input);
}
lane_count = core.GetSVELaneCount(kDRegSize);
for (int i = lane_count - 1; i >= 0; i--) {
uint64_t expected = core.zreg_lane<uint64_t>(z12.GetCode(), i);
uint32_t s_lane = core.zreg_lane<uint32_t>(z9.GetCode(), i + lane_count);
uint64_t input = SignExtend<int64_t>(s_lane, kSRegSize);
ASSERT_EQUAL_64(expected, input);
}
// Sunpklo
lane_count = core.GetSVELaneCount(kHRegSize);
for (int i = lane_count - 1; i >= 0; i--) {
uint16_t expected = core.zreg_lane<uint16_t>(z13.GetCode(), i);
uint8_t b_lane = core.zreg_lane<uint8_t>(z9.GetCode(), i);
uint16_t input = SignExtend<int16_t>(b_lane, kBRegSize);
ASSERT_EQUAL_64(expected, input);
}
lane_count = core.GetSVELaneCount(kSRegSize);
for (int i = lane_count - 1; i >= 0; i--) {
uint32_t expected = core.zreg_lane<uint32_t>(z14.GetCode(), i);
uint16_t h_lane = core.zreg_lane<uint16_t>(z9.GetCode(), i);
uint32_t input = SignExtend<int32_t>(h_lane, kHRegSize);
ASSERT_EQUAL_64(expected, input);
}
lane_count = core.GetSVELaneCount(kDRegSize);
for (int i = lane_count - 1; i >= 0; i--) {
uint64_t expected = core.zreg_lane<uint64_t>(z15.GetCode(), i);
uint32_t s_lane = core.zreg_lane<uint32_t>(z9.GetCode(), i);
uint64_t input = SignExtend<int64_t>(s_lane, kSRegSize);
ASSERT_EQUAL_64(expected, input);
}
// Uunpkhi
lane_count = core.GetSVELaneCount(kHRegSize);
for (int i = lane_count - 1; i >= 0; i--) {
uint16_t expected = core.zreg_lane<uint16_t>(z16.GetCode(), i);
uint16_t input = core.zreg_lane<uint8_t>(z9.GetCode(), i + lane_count);
ASSERT_EQUAL_64(expected, input);
}
lane_count = core.GetSVELaneCount(kSRegSize);
for (int i = lane_count - 1; i >= 0; i--) {
uint32_t expected = core.zreg_lane<uint32_t>(z17.GetCode(), i);
uint32_t input = core.zreg_lane<uint16_t>(z9.GetCode(), i + lane_count);
ASSERT_EQUAL_64(expected, input);
}
lane_count = core.GetSVELaneCount(kDRegSize);
for (int i = lane_count - 1; i >= 0; i--) {
uint64_t expected = core.zreg_lane<uint64_t>(z18.GetCode(), i);
uint64_t input = core.zreg_lane<uint32_t>(z9.GetCode(), i + lane_count);
ASSERT_EQUAL_64(expected, input);
}
// Uunpklo
lane_count = core.GetSVELaneCount(kHRegSize);
for (int i = lane_count - 1; i >= 0; i--) {
uint16_t expected = core.zreg_lane<uint16_t>(z19.GetCode(), i);
uint16_t input = core.zreg_lane<uint8_t>(z9.GetCode(), i);
ASSERT_EQUAL_64(expected, input);
}
lane_count = core.GetSVELaneCount(kSRegSize);
for (int i = lane_count - 1; i >= 0; i--) {
uint32_t expected = core.zreg_lane<uint32_t>(z20.GetCode(), i);
uint32_t input = core.zreg_lane<uint16_t>(z9.GetCode(), i);
ASSERT_EQUAL_64(expected, input);
}
lane_count = core.GetSVELaneCount(kDRegSize);
for (int i = lane_count - 1; i >= 0; i--) {
uint64_t expected = core.zreg_lane<uint64_t>(z21.GetCode(), i);
uint64_t input = core.zreg_lane<uint32_t>(z9.GetCode(), i);
ASSERT_EQUAL_64(expected, input);
}
ASSERT_EQUAL_SVE(z13, z22);
ASSERT_EQUAL_SVE(z19, z23);
}
}
TEST_SVE(sve_cnot_not) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t in[] = {0x0000000000000000, 0x00000000e1c30000, 0x123456789abcdef0};
// For simplicity, we re-use the same pg for various lane sizes.
// For D lanes: 1, 1, 0
// For S lanes: 1, 1, 1, 0, 0
// For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
Initialise(&masm, p0.VnB(), pg_in);
PRegisterM pg = p0.Merging();
// These are merging operations, so we have to initialise the result register.
// We use a mixture of constructive and destructive operations.
InsrHelper(&masm, z31.VnD(), in);
// Make a copy so we can check that constructive operations preserve zn.
__ Mov(z30, z31);
// For constructive operations, use a different initial result value.
__ Index(z29.VnB(), 0, -1);
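// `Cnot` writes 1 to each active lane whose source element is zero and 0
// otherwise; `Not` inverts every bit of each active lane.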
__ Mov(z0, z31);
__ Cnot(z0.VnB(), pg, z0.VnB()); // destructive
__ Mov(z1, z29);
__ Cnot(z1.VnH(), pg, z31.VnH());
__ Mov(z2, z31);
__ Cnot(z2.VnS(), pg, z2.VnS()); // destructive
__ Mov(z3, z29);
__ Cnot(z3.VnD(), pg, z31.VnD());
__ Mov(z4, z29);
__ Not(z4.VnB(), pg, z31.VnB());
__ Mov(z5, z31);
__ Not(z5.VnH(), pg, z5.VnH()); // destructive
__ Mov(z6, z29);
__ Not(z6.VnS(), pg, z31.VnS());
__ Mov(z7, z31);
__ Not(z7.VnD(), pg, z7.VnD()); // destructive
END();
if (CAN_RUN()) {
RUN();
// Check that constructive operations preserve their inputs.
ASSERT_EQUAL_SVE(z30, z31);
// clang-format off
// Cnot (B) destructive
uint64_t expected_z0[] =
// pg: 0 0 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0
{0x0000000001000101, 0x01000001e1000101, 0x12340078000000f0};
ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
// Cnot (H)
uint64_t expected_z1[] =
// pg: 0 0 0 1 0 1 1 1 0 0 1 0
{0xe9eaebecedee0001, 0xf1f2000100000001, 0xf9fafbfc0000ff00};
ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
// Cnot (S) destructive
uint64_t expected_z2[] =
// pg: 0 1 1 1 0 0
{0x0000000000000001, 0x0000000100000000, 0x123456789abcdef0};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
// Cnot (D)
uint64_t expected_z3[] =
// pg: 1 1 0
{0x0000000000000001, 0x0000000000000000, 0xf9fafbfcfdfeff00};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
// Not (B)
uint64_t expected_z4[] =
// pg: 0 0 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0
{0xe9eaebecffeeffff, 0xfff2f3fff53cffff, 0xf9faa9fc65432100};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
// Not (H) destructive
uint64_t expected_z5[] =
// pg: 0 0 0 1 0 1 1 1 0 0 1 0
{0x000000000000ffff, 0x0000ffff1e3cffff, 0x123456786543def0};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
// Not (S)
uint64_t expected_z6[] =
// pg: 0 1 1 1 0 0
{0xe9eaebecffffffff, 0xffffffff1e3cffff, 0xf9fafbfcfdfeff00};
ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
// Not (D) destructive
uint64_t expected_z7[] =
// pg: 1 1 0
{0xffffffffffffffff, 0xffffffff1e3cffff, 0x123456789abcdef0};
ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
// clang-format on
}
}
TEST_SVE(sve_fabs_fneg) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// Include FP64, FP32 and FP16 signalling NaNs. Most FP operations quieten
// NaNs, but fabs and fneg do not.
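// `fabs` and `fneg` only clear or invert the sign bit, so NaN payloads
// pass through unmodified.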
uint64_t in[] = {0xc04500004228d140, // Recognisable (+/-42) values.
0xfff00000ff80fc01, // Signalling NaNs.
0x123456789abcdef0};
// For simplicity, we re-use the same pg for various lane sizes.
// For D lanes: 1, 1, 0
// For S lanes: 1, 1, 1, 0, 0
// For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
Initialise(&masm, p0.VnB(), pg_in);
PRegisterM pg = p0.Merging();
// These are merging operations, so we have to initialise the result register.
// We use a mixture of constructive and destructive operations.
InsrHelper(&masm, z31.VnD(), in);
// Make a copy so we can check that constructive operations preserve zn.
__ Mov(z30, z31);
// For constructive operations, use a different initial result value.
__ Index(z29.VnB(), 0, -1);
__ Mov(z0, z29);
__ Fabs(z0.VnH(), pg, z31.VnH());
__ Mov(z1, z31);
__ Fabs(z1.VnS(), pg, z1.VnS()); // destructive
__ Mov(z2, z29);
__ Fabs(z2.VnD(), pg, z31.VnD());
__ Mov(z3, z31);
__ Fneg(z3.VnH(), pg, z3.VnH()); // destructive
__ Mov(z4, z29);
__ Fneg(z4.VnS(), pg, z31.VnS());
__ Mov(z5, z31);
__ Fneg(z5.VnD(), pg, z5.VnD()); // destructive
END();
if (CAN_RUN()) {
RUN();
// Check that constructive operations preserve their inputs.
ASSERT_EQUAL_SVE(z30, z31);
// clang-format off
// Fabs (H)
uint64_t expected_z0[] =
// pg: 0 0 0 1 0 1 1 1 0 0 1 0
{0xe9eaebecedee5140, 0xf1f200007f807c01, 0xf9fafbfc1abcff00};
ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
// Fabs (S) destructive
uint64_t expected_z1[] =
// pg: 0 1 1 1 0 0
{0xc04500004228d140, 0x7ff000007f80fc01, 0x123456789abcdef0};
ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
// Fabs (D)
uint64_t expected_z2[] =
// pg: 1 1 0
{0x404500004228d140, 0x7ff00000ff80fc01, 0xf9fafbfcfdfeff00};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
// Fneg (H) destructive
uint64_t expected_z3[] =
// pg: 0 0 0 1 0 1 1 1 0 0 1 0
{0xc045000042285140, 0xfff080007f807c01, 0x123456781abcdef0};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
// Fneg (S)
uint64_t expected_z4[] =
// pg: 0 1 1 1 0 0
{0xe9eaebecc228d140, 0x7ff000007f80fc01, 0xf9fafbfcfdfeff00};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
// Fneg (D) destructive
uint64_t expected_z5[] =
// pg: 1 1 0
{0x404500004228d140, 0x7ff00000ff80fc01, 0x123456789abcdef0};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
// clang-format on
}
}
TEST_SVE(sve_cls_clz_cnt) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t in[] = {0x0000000000000000, 0xfefcf8f0e1c3870f, 0x123456789abcdef0};
// For simplicity, we re-use the same pg for various lane sizes.
// For D lanes: 1, 1, 0
// For S lanes: 1, 1, 1, 0, 0
// For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
Initialise(&masm, p0.VnB(), pg_in);
PRegisterM pg = p0.Merging();
// These are merging operations, so we have to initialise the result register.
// We use a mixture of constructive and destructive operations.
InsrHelper(&masm, z31.VnD(), in);
// Make a copy so we can check that constructive operations preserve zn.
__ Mov(z30, z31);
// For constructive operations, use a different initial result value.
__ Index(z29.VnB(), 0, -1);
__ Mov(z0, z29);
__ Cls(z0.VnB(), pg, z31.VnB());
__ Mov(z1, z31);
__ Clz(z1.VnH(), pg, z1.VnH()); // destructive
__ Mov(z2, z29);
__ Cnt(z2.VnS(), pg, z31.VnS());
__ Mov(z3, z31);
__ Cnt(z3.VnD(), pg, z3.VnD()); // destructive
END();
if (CAN_RUN()) {
RUN();
// Check that constructive operations preserve their inputs.
ASSERT_EQUAL_SVE(z30, z31);
// clang-format off
// cls (B)
uint8_t expected_z0[] =
// pg: 0 0 0 0 1 0 1 1
// pg: 1 0 0 1 0 1 1 1
// pg: 0 0 1 0 1 1 1 0
{0xe9, 0xea, 0xeb, 0xec, 7, 0xee, 7, 7,
6, 0xf2, 0xf3, 3, 0xf5, 1, 0, 3,
0xf9, 0xfa, 0, 0xfc, 0, 0, 1, 0x00};
ASSERT_EQUAL_SVE(expected_z0, z0.VnB());
// clz (H) destructive
uint16_t expected_z1[] =
// pg: 0 0 0 1
// pg: 0 1 1 1
// pg: 0 0 1 0
{0x0000, 0x0000, 0x0000, 16,
0xfefc, 0, 0, 0,
0x1234, 0x5678, 0, 0xdef0};
ASSERT_EQUAL_SVE(expected_z1, z1.VnH());
// cnt (S)
uint32_t expected_z2[] =
// pg: 0 1
// pg: 1 1
// pg: 0 0
{0xe9eaebec, 0,
22, 16,
0xf9fafbfc, 0xfdfeff00};
ASSERT_EQUAL_SVE(expected_z2, z2.VnS());
// cnt (D) destructive
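// For example, popcount(0xfefcf8f0e1c3870f) = 7+6+5+4+4+4+4+4 = 38.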
uint64_t expected_z3[] =
// pg: 1 1 0
{ 0, 38, 0x123456789abcdef0};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
// clang-format on
}
}
TEST_SVE(sve_sxt) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t in[] = {0x01f203f405f607f8, 0xfefcf8f0e1c3870f, 0x123456789abcdef0};
// For simplicity, we re-use the same pg for various lane sizes.
// For D lanes: 1, 1, 0
// For S lanes: 1, 1, 1, 0, 0
// For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
Initialise(&masm, p0.VnB(), pg_in);
PRegisterM pg = p0.Merging();
// These are merging operations, so we have to initialise the result register.
// We use a mixture of constructive and destructive operations.
InsrHelper(&masm, z31.VnD(), in);
// Make a copy so we can check that constructive operations preserve zn.
__ Mov(z30, z31);
// For constructive operations, use a different initial result value.
__ Index(z29.VnB(), 0, -1);
__ Mov(z0, z31);
__ Sxtb(z0.VnH(), pg, z0.VnH()); // destructive
__ Mov(z1, z29);
__ Sxtb(z1.VnS(), pg, z31.VnS());
__ Mov(z2, z31);
__ Sxtb(z2.VnD(), pg, z2.VnD()); // destructive
__ Mov(z3, z29);
__ Sxth(z3.VnS(), pg, z31.VnS());
__ Mov(z4, z31);
__ Sxth(z4.VnD(), pg, z4.VnD()); // destructive
__ Mov(z5, z29);
__ Sxtw(z5.VnD(), pg, z31.VnD());
END();
if (CAN_RUN()) {
RUN();
// Check that constructive operations preserve their inputs.
ASSERT_EQUAL_SVE(z30, z31);
// clang-format off
// Sxtb (H) destructive
uint64_t expected_z0[] =
// pg: 0 0 0 1 0 1 1 1 0 0 1 0
{0x01f203f405f6fff8, 0xfefcfff0ffc3000f, 0x12345678ffbcdef0};
ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
// Sxtb (S)
uint64_t expected_z1[] =
// pg: 0 1 1 1 0 0
{0xe9eaebecfffffff8, 0xfffffff00000000f, 0xf9fafbfcfdfeff00};
ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
// Sxtb (D) destructive
uint64_t expected_z2[] =
// pg: 1 1 0
{0xfffffffffffffff8, 0x000000000000000f, 0x123456789abcdef0};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
// Sxth (S)
uint64_t expected_z3[] =
// pg: 0 1 1 1 0 0
{0xe9eaebec000007f8, 0xfffff8f0ffff870f, 0xf9fafbfcfdfeff00};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
// Sxth (D) destructive
uint64_t expected_z4[] =
// pg: 1 1 0
{0x00000000000007f8, 0xffffffffffff870f, 0x123456789abcdef0};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
// Sxtw (D)
uint64_t expected_z5[] =
// pg: 1 1 0
{0x0000000005f607f8, 0xffffffffe1c3870f, 0xf9fafbfcfdfeff00};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
// clang-format on
}
}
TEST_SVE(sve_uxt) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t in[] = {0x01f203f405f607f8, 0xfefcf8f0e1c3870f, 0x123456789abcdef0};
// For simplicity, we re-use the same pg for various lane sizes.
// For D lanes: 1, 1, 0
// For S lanes: 1, 1, 1, 0, 0
// For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
Initialise(&masm, p0.VnB(), pg_in);
PRegisterM pg = p0.Merging();
// These are merging operations, so we have to initialise the result register.
// We use a mixture of constructive and destructive operations.
InsrHelper(&masm, z31.VnD(), in);
// Make a copy so we can check that constructive operations preserve zn.
__ Mov(z30, z31);
// For constructive operations, use a different initial result value.
__ Index(z29.VnB(), 0, -1);
__ Mov(z0, z29);
__ Uxtb(z0.VnH(), pg, z31.VnH());
__ Mov(z1, z31);
__ Uxtb(z1.VnS(), pg, z1.VnS()); // destructive
__ Mov(z2, z29);
__ Uxtb(z2.VnD(), pg, z31.VnD());
__ Mov(z3, z31);
__ Uxth(z3.VnS(), pg, z3.VnS()); // destructive
__ Mov(z4, z29);
__ Uxth(z4.VnD(), pg, z31.VnD());
__ Mov(z5, z31);
__ Uxtw(z5.VnD(), pg, z5.VnD()); // destructive
END();
if (CAN_RUN()) {
RUN();
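// Check that constructive operations preserve their inputs.
ASSERT_EQUAL_SVE(z30, z31);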
// clang-format off
// Uxtb (H)
uint64_t expected_z0[] =
// pg: 0 0 0 1 0 1 1 1 0 0 1 0
{0xe9eaebecedee00f8, 0xf1f200f000c3000f, 0xf9fafbfc00bcff00};
ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
// Uxtb (S) destructive
uint64_t expected_z1[] =
// pg: 0 1 1 1 0 0
{0x01f203f4000000f8, 0x000000f00000000f, 0x123456789abcdef0};
ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
// Uxtb (D)
uint64_t expected_z2[] =
// pg: 1 1 0
{0x00000000000000f8, 0x000000000000000f, 0xf9fafbfcfdfeff00};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
// Uxth (S) destructive
uint64_t expected_z3[] =
// pg: 0 1 1 1 0 0
{0x01f203f4000007f8, 0x0000f8f00000870f, 0x123456789abcdef0};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
// Uxth (D)
uint64_t expected_z4[] =
// pg: 1 1 0
{0x00000000000007f8, 0x000000000000870f, 0xf9fafbfcfdfeff00};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
// Uxtw (D) destructive
uint64_t expected_z5[] =
// pg: 1 1 0
{0x0000000005f607f8, 0x00000000e1c3870f, 0x123456789abcdef0};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
// clang-format on
}
}
TEST_SVE(sve_abs_neg) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t in[] = {0x01f203f405f607f8, 0xfefcf8f0e1c3870f, 0x123456789abcdef0};
// For simplicity, we re-use the same pg for various lane sizes.
// For D lanes: 1, 1, 0
// For S lanes: 1, 1, 1, 0, 0
// For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
Initialise(&masm, p0.VnB(), pg_in);
PRegisterM pg = p0.Merging();
// These are merging operations, so we have to initialise the result register.
// We use a mixture of constructive and destructive operations.
InsrHelper(&masm, z31.VnD(), in);
// Make a copy so we can check that constructive operations preserve zn.
__ Mov(z30, z31);
// For constructive operations, use a different initial result value.
__ Index(z29.VnB(), 0, -1);
__ Mov(z0, z31);
__ Abs(z0.VnD(), pg, z0.VnD()); // destructive
__ Mov(z1, z29);
__ Abs(z1.VnB(), pg, z31.VnB());
__ Mov(z2, z31);
__ Neg(z2.VnH(), pg, z2.VnH()); // destructive
__ Mov(z3, z29);
__ Neg(z3.VnS(), pg, z31.VnS());
// The unpredicated form of `Neg` is implemented using `subr`.
__ Mov(z4, z31);
__ Neg(z4.VnB(), z4.VnB()); // destructive
__ Mov(z5, z29);
__ Neg(z5.VnD(), z31.VnD());
END();
if (CAN_RUN()) {
RUN();
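// Check that constructive operations preserve their inputs.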
ASSERT_EQUAL_SVE(z30, z31);
// clang-format off
// Abs (D) destructive
uint64_t expected_z0[] =
// pg: 1 1 0
{0x01f203f405f607f8, 0x0103070f1e3c78f1, 0x123456789abcdef0};
ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
// Abs (B)
uint64_t expected_z1[] =
// pg: 0 0 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0
{0xe9eaebec05ee0708, 0x02f2f310f53d790f, 0xf9fa56fc66442200};
ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
// Neg (H) destructive
uint64_t expected_z2[] =
// pg: 0 0 0 1 0 1 1 1 0 0 1 0
{0x01f203f405f6f808, 0xfefc07101e3d78f1, 0x123456786544def0};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
// Neg (S)
uint64_t expected_z3[] =
// pg: 0 1 1 1 0 0
{0xe9eaebecfa09f808, 0x010307101e3c78f1, 0xf9fafbfcfdfeff00};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
// Neg (B) destructive, unpredicated
uint64_t expected_z4[] =
{0xff0efd0cfb0af908, 0x020408101f3d79f1, 0xeeccaa8866442210};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
// Neg (D) unpredicated
uint64_t expected_z5[] =
{0xfe0dfc0bfa09f808, 0x0103070f1e3c78f1, 0xedcba98765432110};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
// clang-format on
}
}
TEST_SVE(sve_cpy) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
START();
// For simplicity, we re-use the same pg for various lane sizes.
// For D lanes: 0, 1, 1
// For S lanes: 0, 1, 1, 0, 1
// For H lanes: 1, 0, 0, 1, 0, 1, 1, 0, 0, 1
int pg_in[] = {1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1};
PRegisterM pg = p7.Merging();
Initialise(&masm, pg.VnB(), pg_in);
// These are merging operations, so we have to initialise the result registers
// for each operation.
for (unsigned i = 0; i < kNumberOfZRegisters; i++) {
__ Index(ZRegister(i, kBRegSize), 0, -1);
}
// Recognisable values to copy.
__ Mov(x0, 0xdeadbeefdeadbe42);
__ Mov(x1, 0xdeadbeefdead8421);
__ Mov(x2, 0xdeadbeef80042001);
__ Mov(x3, 0x8000000420000001);
// Use NEON moves, to avoid testing SVE `cpy` against itself.
__ Dup(v28.V2D(), x0);
__ Dup(v29.V2D(), x1);
__ Dup(v30.V2D(), x2);
__ Dup(v31.V2D(), x3);
// Register forms (CPY_z_p_r)
__ Cpy(z0.VnB(), pg, w0);
__ Cpy(z1.VnH(), pg, x1); // X registers are accepted for small lanes.
__ Cpy(z2.VnS(), pg, w2);
__ Cpy(z3.VnD(), pg, x3);
// VRegister forms (CPY_z_p_v)
__ Cpy(z4.VnB(), pg, b28);
__ Cpy(z5.VnH(), pg, h29);
__ Cpy(z6.VnS(), pg, s30);
__ Cpy(z7.VnD(), pg, d31);
// Check that we can copy the stack pointer.
__ Mov(x10, sp);
__ Mov(sp, 0xabcabcabcabcabca); // Set sp to a known value.
__ Cpy(z16.VnB(), pg, sp);
__ Cpy(z17.VnH(), pg, wsp);
__ Cpy(z18.VnS(), pg, wsp);
__ Cpy(z19.VnD(), pg, sp);
__ Mov(sp, x10); // Restore sp.
END();
if (CAN_RUN()) {
RUN();
// clang-format off
uint64_t expected_b[] =
// pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1
{0xe9eaebec424242f0, 0x42f2f34242f64242, 0xf942fbfcfdfeff42};
ASSERT_EQUAL_SVE(expected_b, z0.VnD());
ASSERT_EQUAL_SVE(expected_b, z4.VnD());
uint64_t expected_h[] =
// pg: 0 0 1 0 0 1 0 1 1 0 0 1
{0xe9eaebec8421eff0, 0xf1f28421f5f68421, 0x8421fbfcfdfe8421};
ASSERT_EQUAL_SVE(expected_h, z1.VnD());
ASSERT_EQUAL_SVE(expected_h, z5.VnD());
uint64_t expected_s[] =
// pg: 0 0 1 1 0 1
{0xe9eaebecedeeeff0, 0x8004200180042001, 0xf9fafbfc80042001};
ASSERT_EQUAL_SVE(expected_s, z2.VnD());
ASSERT_EQUAL_SVE(expected_s, z6.VnD());
uint64_t expected_d[] =
// pg: 0 1 1
{0xe9eaebecedeeeff0, 0x8000000420000001, 0x8000000420000001};
ASSERT_EQUAL_SVE(expected_d, z3.VnD());
ASSERT_EQUAL_SVE(expected_d, z7.VnD());
uint64_t expected_b_sp[] =
// pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1
{0xe9eaebeccacacaf0, 0xcaf2f3cacaf6caca, 0xf9cafbfcfdfeffca};
ASSERT_EQUAL_SVE(expected_b_sp, z16.VnD());
uint64_t expected_h_sp[] =
// pg: 0 0 1 0 0 1 0 1 1 0 0 1
{0xe9eaebecabcaeff0, 0xf1f2abcaf5f6abca, 0xabcafbfcfdfeabca};
ASSERT_EQUAL_SVE(expected_h_sp, z17.VnD());
uint64_t expected_s_sp[] =
// pg: 0 0 1 1 0 1
{0xe9eaebecedeeeff0, 0xcabcabcacabcabca, 0xf9fafbfccabcabca};
ASSERT_EQUAL_SVE(expected_s_sp, z18.VnD());
uint64_t expected_d_sp[] =
// pg: 0 1 1
{0xe9eaebecedeeeff0, 0xabcabcabcabcabca, 0xabcabcabcabcabca};
ASSERT_EQUAL_SVE(expected_d_sp, z19.VnD());
// clang-format on
}
}
TEST_SVE(sve_cpy_imm) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// For simplicity, we re-use the same pg for various lane sizes.
// For D lanes: 0, 1, 1
// For S lanes: 0, 1, 1, 0, 1
// For H lanes: 1, 0, 0, 1, 0, 1, 1, 0, 0, 1
int pg_in[] = {1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1};
PRegister pg = p7;
Initialise(&masm, pg.VnB(), pg_in);
// These are (mostly) merging operations, so we have to initialise the result
// registers for each operation.
for (unsigned i = 0; i < kNumberOfZRegisters; i++) {
__ Index(ZRegister(i, kBRegSize), 0, -1);
}
// Encodable integer forms (CPY_z_p_i)
__ Cpy(z0.VnB(), pg.Merging(), 0);
__ Cpy(z1.VnB(), pg.Zeroing(), 42);
__ Cpy(z2.VnB(), pg.Merging(), -42);
__ Cpy(z3.VnB(), pg.Zeroing(), 0xff);
__ Cpy(z4.VnH(), pg.Merging(), 127);
__ Cpy(z5.VnS(), pg.Zeroing(), -128);
__ Cpy(z6.VnD(), pg.Merging(), -1);
// Forms encodable using fcpy.
__ Cpy(z7.VnH(), pg.Merging(), Float16ToRawbits(Float16(-31.0)));
__ Cpy(z8.VnS(), pg.Zeroing(), FloatToRawbits(2.0f));
__ Cpy(z9.VnD(), pg.Merging(), DoubleToRawbits(-4.0));
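// (The raw encodings are 0xcfc0 for -31.0 as FP16, 0x40000000 for 2.0f
// and 0xc010000000000000 for -4.0, as the expected values below show.)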
// Other forms use a scratch register.
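// (`cpy` encodes an 8-bit signed immediate, optionally shifted left by
// eight: 0xff fits a B lane as -1, but as the H-lane value 255 it is
// unencodable, and 0x0123456789abcdef needs a full 64-bit move.)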
__ Cpy(z10.VnH(), pg.Merging(), 0xff);
__ Cpy(z11.VnD(), pg.Zeroing(), 0x0123456789abcdef);
END();
if (CAN_RUN()) {
RUN();
// clang-format off
uint64_t expected_z0[] =
// pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1
{0xe9eaebec000000f0, 0x00f2f30000f60000, 0xf900fbfcfdfeff00};
ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
uint64_t expected_z1[] =
// pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1
{0x000000002a2a2a00, 0x2a00002a2a002a2a, 0x002a00000000002a};
ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
uint64_t expected_z2[] =
// pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1
{0xe9eaebecd6d6d6f0, 0xd6f2f3d6d6f6d6d6, 0xf9d6fbfcfdfeffd6};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
uint64_t expected_z3[] =
// pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1
{0x00000000ffffff00, 0xff0000ffff00ffff, 0x00ff0000000000ff};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
uint64_t expected_z4[] =
// pg: 0 0 1 0 0 1 0 1 1 0 0 1
{0xe9eaebec007feff0, 0xf1f2007ff5f6007f, 0x007ffbfcfdfe007f};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
uint64_t expected_z5[] =
// pg: 0 0 1 1 0 1
{0x0000000000000000, 0xffffff80ffffff80, 0x00000000ffffff80};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
uint64_t expected_z6[] =
// pg: 0 1 1
{0xe9eaebecedeeeff0, 0xffffffffffffffff, 0xffffffffffffffff};
ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
uint64_t expected_z7[] =
// pg: 0 0 1 0 0 1 0 1 1 0 0 1
{0xe9eaebeccfc0eff0, 0xf1f2cfc0f5f6cfc0, 0xcfc0fbfcfdfecfc0};
ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
uint64_t expected_z8[] =
// pg: 0 0 1 1 0 1
{0x0000000000000000, 0x4000000040000000, 0x0000000040000000};
ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
uint64_t expected_z9[] =
// pg: 0 1 1
{0xe9eaebecedeeeff0, 0xc010000000000000, 0xc010000000000000};
ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
uint64_t expected_z10[] =
// pg: 0 0 1 0 0 1 0 1 1 0 0 1
{0xe9eaebec00ffeff0, 0xf1f200fff5f600ff, 0x00fffbfcfdfe00ff};
ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
uint64_t expected_z11[] =
// pg: 0 1 1
{0x0000000000000000, 0x0123456789abcdef, 0x0123456789abcdef};
ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
// clang-format on
}
}
TEST_SVE(sve_fcpy_imm) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// For simplicity, we re-use the same pg for various lane sizes.
// For D lanes: 0, 1, 1
// For S lanes: 0, 1, 1, 0, 1
// For H lanes: 1, 0, 0, 1, 0, 1, 1, 0, 0, 1
int pg_in[] = {1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1};
PRegister pg = p7;
Initialise(&masm, pg.VnB(), pg_in);
// These are (mostly) merging operations, so we have to initialise the result
// registers for each operation.
for (unsigned i = 0; i < kNumberOfZRegisters; i++) {
__ Index(ZRegister(i, kBRegSize), 0, -1);
}
// Encodable floating-point forms (FCPY_z_p_i)
__ Fcpy(z1.VnH(), pg.Merging(), Float16(1.0));
__ Fcpy(z2.VnH(), pg.Merging(), -2.0f);
__ Fcpy(z3.VnH(), pg.Merging(), 3.0);
__ Fcpy(z4.VnS(), pg.Merging(), Float16(-4.0));
__ Fcpy(z5.VnS(), pg.Merging(), 5.0f);
__ Fcpy(z6.VnS(), pg.Merging(), 6.0);
__ Fcpy(z7.VnD(), pg.Merging(), Float16(7.0));
__ Fcpy(z8.VnD(), pg.Merging(), 8.0f);
__ Fmov(z9.VnD(), pg.Merging(), -9.0);
// Unencodable immediates.
__ Fcpy(z10.VnS(), pg.Merging(), 0.0);
__ Fcpy(z11.VnH(), pg.Merging(), Float16(42.0));
__ Fcpy(z12.VnD(), pg.Merging(), RawbitsToDouble(0x7ff0000012340000)); // NaN
__ Fcpy(z13.VnH(), pg.Merging(), kFP64NegativeInfinity);
// Fmov alias.
__ Fmov(z14.VnS(), pg.Merging(), 0.0);
__ Fmov(z15.VnH(), pg.Merging(), Float16(42.0));
__ Fmov(z16.VnD(), pg.Merging(), RawbitsToDouble(0x7ff0000012340000)); // NaN
__ Fmov(z17.VnH(), pg.Merging(), kFP64NegativeInfinity);
END();
if (CAN_RUN()) {
RUN();
// clang-format off
// 1.0 as FP16: 0x3c00
uint64_t expected_z1[] =
// pg: 0 0 1 0 0 1 0 1 1 0 0 1
{0xe9eaebec3c00eff0, 0xf1f23c00f5f63c00, 0x3c00fbfcfdfe3c00};
ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
// -2.0 as FP16: 0xc000
uint64_t expected_z2[] =
// pg: 0 0 1 0 0 1 0 1 1 0 0 1
{0xe9eaebecc000eff0, 0xf1f2c000f5f6c000, 0xc000fbfcfdfec000};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
// 3.0 as FP16: 0x4200
uint64_t expected_z3[] =
// pg: 0 0 1 0 0 1 0 1 1 0 0 1
{0xe9eaebec4200eff0, 0xf1f24200f5f64200, 0x4200fbfcfdfe4200};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
// -4.0 as FP32: 0xc0800000
uint64_t expected_z4[] =
// pg: 0 0 1 1 0 1
{0xe9eaebecedeeeff0, 0xc0800000c0800000, 0xf9fafbfcc0800000};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
// 5.0 as FP32: 0x40a00000
uint64_t expected_z5[] =
// pg: 0 0 1 1 0 1
{0xe9eaebecedeeeff0, 0x40a0000040a00000, 0xf9fafbfc40a00000};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
// 6.0 as FP32: 0x40c00000
uint64_t expected_z6[] =
// pg: 0 0 1 1 0 1
{0xe9eaebecedeeeff0, 0x40c0000040c00000, 0xf9fafbfc40c00000};
ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
// 7.0 as FP64: 0x401c000000000000
uint64_t expected_z7[] =
// pg: 0 1 1
{0xe9eaebecedeeeff0, 0x401c000000000000, 0x401c000000000000};
ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
// 8.0 as FP64: 0x4020000000000000
uint64_t expected_z8[] =
// pg: 0 1 1
{0xe9eaebecedeeeff0, 0x4020000000000000, 0x4020000000000000};
ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
// -9.0 as FP64: 0xc022000000000000
uint64_t expected_z9[] =
// pg: 0 1 1
{0xe9eaebecedeeeff0, 0xc022000000000000, 0xc022000000000000};
ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
// 0.0 as FP32: 0x00000000
uint64_t expected_z10[] =
// pg: 0 0 1 1 0 1
{0xe9eaebecedeeeff0, 0x0000000000000000, 0xf9fafbfc00000000};
ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
// 42.0 as FP16: 0x5140
uint64_t expected_z11[] =
// pg: 0 0 1 0 0 1 0 1 1 0 0 1
{0xe9eaebec5140eff0, 0xf1f25140f5f65140, 0x5140fbfcfdfe5140};
ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
// Signalling NaN (with payload): 0x7ff0000012340000
uint64_t expected_z12[] =
// pg: 0 1 1
{0xe9eaebecedeeeff0, 0x7ff0000012340000, 0x7ff0000012340000};
ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
// -infinity as FP16: 0xfc00
uint64_t expected_z13[] =
// pg: 0 0 1 0 0 1 0 1 1 0 0 1
{0xe9eaebecfc00eff0, 0xf1f2fc00f5f6fc00, 0xfc00fbfcfdfefc00};
ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
ASSERT_EQUAL_SVE(z10.VnD(), z14.VnD());
ASSERT_EQUAL_SVE(z11.VnD(), z15.VnD());
ASSERT_EQUAL_SVE(z12.VnD(), z16.VnD());
ASSERT_EQUAL_SVE(z13.VnD(), z17.VnD());
// clang-format on
}
}
TEST_SVE(sve_permute_vector_unpredicated_table_lookup) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t table_inputs[] = {0xffeeddccbbaa9988, 0x7766554433221100};
int index_b[] = {255, 255, 11, 10, 15, 14, 13, 12, 1, 0, 4, 3, 7, 6, 5, 4};
int index_h[] = {5, 6, 7, 8, 2, 3, 6, 4};
int index_s[] = {1, 3, 2, 31, -1};
int index_d[] = {31, 1};
// Initialise the register with a value that doesn't exist in the table.
__ Dup(z9.VnB(), 0x1f);
InsrHelper(&masm, z9.VnD(), table_inputs);
ZRegister ind_b = z0.WithLaneSize(kBRegSize);
ZRegister ind_h = z1.WithLaneSize(kHRegSize);
ZRegister ind_s = z2.WithLaneSize(kSRegSize);
ZRegister ind_d = z3.WithLaneSize(kDRegSize);
InsrHelper(&masm, ind_b, index_b);
InsrHelper(&masm, ind_h, index_h);
InsrHelper(&masm, ind_s, index_s);
InsrHelper(&masm, ind_d, index_d);
__ Tbl(z26.VnB(), z9.VnB(), ind_b);
__ Tbl(z27.VnH(), z9.VnH(), ind_h);
__ Tbl(z28.VnS(), z9.VnS(), ind_s);
__ Tbl(z29.VnD(), z9.VnD(), ind_d);
END();
if (CAN_RUN()) {
RUN();
// clang-format off
unsigned z26_expected[] = {0x1f, 0x1f, 0xbb, 0xaa, 0xff, 0xee, 0xdd, 0xcc,
0x11, 0x00, 0x44, 0x33, 0x77, 0x66, 0x55, 0x44};
unsigned z27_expected[] = {0xbbaa, 0xddcc, 0xffee, 0x1f1f,
0x5544, 0x7766, 0xddcc, 0x9988};
unsigned z28_expected[] =
{0x77665544, 0xffeeddcc, 0xbbaa9988, 0x1f1f1f1f, 0x1f1f1f1f};
uint64_t z29_expected[] = {0x1f1f1f1f1f1f1f1f, 0xffeeddccbbaa9988};
// clang-format on
unsigned vl = config->sve_vl_in_bits();
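// `tbl` selects zero for any out-of-range index, so each expected value
// only applies when its index refers to a lane within the current vector.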
for (size_t i = 0; i < ArrayLength(index_b); i++) {
int lane = static_cast<int>(ArrayLength(index_b) - i - 1);
if (!core.HasSVELane(z26.VnB(), lane)) break;
uint64_t expected = (vl > (index_b[i] * kBRegSize)) ? z26_expected[i] : 0;
ASSERT_EQUAL_SVE_LANE(expected, z26.VnB(), lane);
}
for (size_t i = 0; i < ArrayLength(index_h); i++) {
int lane = static_cast<int>(ArrayLength(index_h) - i - 1);
if (!core.HasSVELane(z27.VnH(), lane)) break;
uint64_t expected = (vl > (index_h[i] * kHRegSize)) ? z27_expected[i] : 0;
ASSERT_EQUAL_SVE_LANE(expected, z27.VnH(), lane);
}
for (size_t i = 0; i < ArrayLength(index_s); i++) {
int lane = static_cast<int>(ArrayLength(index_s) - i - 1);
if (!core.HasSVELane(z28.VnS(), lane)) break;
uint64_t expected = (vl > (index_s[i] * kSRegSize)) ? z28_expected[i] : 0;
ASSERT_EQUAL_SVE_LANE(expected, z28.VnS(), lane);
}
for (size_t i = 0; i < ArrayLength(index_d); i++) {
int lane = static_cast<int>(ArrayLength(index_d) - i - 1);
if (!core.HasSVELane(z29.VnD(), lane)) break;
uint64_t expected = (vl > (index_d[i] * kDRegSize)) ? z29_expected[i] : 0;
ASSERT_EQUAL_SVE_LANE(expected, z29.VnD(), lane);
}
}
}
TEST_SVE(ldr_str_z_bi) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int vl = config->sve_vl_in_bytes();
// The immediate can address [-256, 255] times the VL, so allocate enough
// space to exceed that in both directions.
int data_size = vl * 1024;
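// (vl * 1024 leaves 512 VLs on each side of the midpoint, which also
// covers the unencodable +/-314 VL accesses below.)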
uint8_t* data = new uint8_t[data_size];
memset(data, 0, data_size);
// Set the base half-way through the buffer so we can use negative indices.
__ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
__ Index(z1.VnB(), 1, 3);
__ Index(z2.VnB(), 2, 5);
__ Index(z3.VnB(), 3, 7);
__ Index(z4.VnB(), 4, 11);
__ Index(z5.VnB(), 5, 13);
__ Index(z6.VnB(), 6, 2);
__ Index(z7.VnB(), 7, 3);
__ Index(z8.VnB(), 8, 5);
__ Index(z9.VnB(), 9, 7);
// Encodable cases.
__ Str(z1, SVEMemOperand(x0));
__ Str(z2, SVEMemOperand(x0, 2, SVE_MUL_VL));
__ Str(z3, SVEMemOperand(x0, -3, SVE_MUL_VL));
__ Str(z4, SVEMemOperand(x0, 255, SVE_MUL_VL));
__ Str(z5, SVEMemOperand(x0, -256, SVE_MUL_VL));
// Cases that fall back on `CalculateSVEAddress`.
__ Str(z6, SVEMemOperand(x0, 6 * vl));
__ Str(z7, SVEMemOperand(x0, -7 * vl));
__ Str(z8, SVEMemOperand(x0, 314, SVE_MUL_VL));
__ Str(z9, SVEMemOperand(x0, -314, SVE_MUL_VL));
// Corresponding loads.
__ Ldr(z11, SVEMemOperand(x0, xzr)); // Test xzr operand.
__ Ldr(z12, SVEMemOperand(x0, 2, SVE_MUL_VL));
__ Ldr(z13, SVEMemOperand(x0, -3, SVE_MUL_VL));
__ Ldr(z14, SVEMemOperand(x0, 255, SVE_MUL_VL));
__ Ldr(z15, SVEMemOperand(x0, -256, SVE_MUL_VL));
__ Ldr(z16, SVEMemOperand(x0, 6 * vl));
__ Ldr(z17, SVEMemOperand(x0, -7 * vl));
__ Ldr(z18, SVEMemOperand(x0, 314, SVE_MUL_VL));
__ Ldr(z19, SVEMemOperand(x0, -314, SVE_MUL_VL));
END();
if (CAN_RUN()) {
RUN();
uint8_t* expected = new uint8_t[data_size];
memset(expected, 0, data_size);
uint8_t* middle = &expected[data_size / 2];
for (int i = 0; i < vl; i++) {
middle[i] = (1 + (3 * i)) & 0xff; // z1
middle[(2 * vl) + i] = (2 + (5 * i)) & 0xff; // z2
middle[(-3 * vl) + i] = (3 + (7 * i)) & 0xff; // z3
middle[(255 * vl) + i] = (4 + (11 * i)) & 0xff; // z4
middle[(-256 * vl) + i] = (5 + (13 * i)) & 0xff; // z5
middle[(6 * vl) + i] = (6 + (2 * i)) & 0xff; // z6
middle[(-7 * vl) + i] = (7 + (3 * i)) & 0xff; // z7
middle[(314 * vl) + i] = (8 + (5 * i)) & 0xff; // z8
middle[(-314 * vl) + i] = (9 + (7 * i)) & 0xff; // z9
}
ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
ASSERT_EQUAL_SVE(z1, z11);
ASSERT_EQUAL_SVE(z2, z12);
ASSERT_EQUAL_SVE(z3, z13);
ASSERT_EQUAL_SVE(z4, z14);
ASSERT_EQUAL_SVE(z5, z15);
ASSERT_EQUAL_SVE(z6, z16);
ASSERT_EQUAL_SVE(z7, z17);
ASSERT_EQUAL_SVE(z8, z18);
ASSERT_EQUAL_SVE(z9, z19);
delete[] expected;
}
delete[] data;
}
TEST_SVE(ldr_str_p_bi) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int vl = config->sve_vl_in_bytes();
VIXL_ASSERT((vl % kZRegBitsPerPRegBit) == 0);
int pl = vl / kZRegBitsPerPRegBit;
// The immediate can address [-256, 255] times the PL, so allocate enough
// space to exceed that in both directions.
int data_size = pl * 1024;
uint8_t* data = new uint8_t[data_size];
memset(data, 0, data_size);
// Set the base half-way through the buffer so we can use negative indices.
__ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
uint64_t pattern[4] = {0x1010101011101111,
0x0010111011000101,
0x1001101110010110,
0x1010110101100011};
for (int i = 8; i <= 15; i++) {
// Initialise p8-p15 with a conveniently-recognisable, non-zero pattern.
Initialise(&masm,
PRegister(i),
pattern[3] * i,
pattern[2] * i,
pattern[1] * i,
pattern[0] * i);
}
// Encodable cases.
__ Str(p8, SVEMemOperand(x0));
__ Str(p9, SVEMemOperand(x0, 2, SVE_MUL_VL));
__ Str(p10, SVEMemOperand(x0, -3, SVE_MUL_VL));
__ Str(p11, SVEMemOperand(x0, 255, SVE_MUL_VL));
// Cases that fall back on `CalculateSVEAddress`.
__ Str(p12, SVEMemOperand(x0, 6 * pl));
__ Str(p13, SVEMemOperand(x0, -7 * pl));
__ Str(p14, SVEMemOperand(x0, 314, SVE_MUL_VL));
__ Str(p15, SVEMemOperand(x0, -314, SVE_MUL_VL));
// Corresponding loads.
__ Ldr(p0, SVEMemOperand(x0));
__ Ldr(p1, SVEMemOperand(x0, 2, SVE_MUL_VL));
__ Ldr(p2, SVEMemOperand(x0, -3, SVE_MUL_VL));
__ Ldr(p3, SVEMemOperand(x0, 255, SVE_MUL_VL));
__ Ldr(p4, SVEMemOperand(x0, 6 * pl));
__ Ldr(p5, SVEMemOperand(x0, -7 * pl));
__ Ldr(p6, SVEMemOperand(x0, 314, SVE_MUL_VL));
__ Ldr(p7, SVEMemOperand(x0, -314, SVE_MUL_VL));
END();
if (CAN_RUN()) {
RUN();
uint8_t* expected = new uint8_t[data_size];
memset(expected, 0, data_size);
uint8_t* middle = &expected[data_size / 2];
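// Reconstruct the stored P registers byte by byte: each 64-bit element of
// `pattern` supplies eight bytes, and scaling the whole pattern by n
// scales each byte by n without carries (as asserted below).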
for (int i = 0; i < pl; i++) {
int bit_index = (i % sizeof(pattern[0])) * kBitsPerByte;
size_t index = i / sizeof(pattern[0]);
VIXL_ASSERT(index < ArrayLength(pattern));
uint64_t byte = (pattern[index] >> bit_index) & 0xff;
// Each byte of `pattern` can be multiplied by 15 without carry.
VIXL_ASSERT((byte * 15) <= 0xff);
middle[i] = byte * 8; // p8
middle[(2 * pl) + i] = byte * 9; // p9
middle[(-3 * pl) + i] = byte * 10; // p10
middle[(255 * pl) + i] = byte * 11; // p11
middle[(6 * pl) + i] = byte * 12; // p12
middle[(-7 * pl) + i] = byte * 13; // p13
middle[(314 * pl) + i] = byte * 14; // p14
middle[(-314 * pl) + i] = byte * 15; // p15
}
ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
ASSERT_EQUAL_SVE(p0, p8);
ASSERT_EQUAL_SVE(p1, p9);
ASSERT_EQUAL_SVE(p2, p10);
ASSERT_EQUAL_SVE(p3, p11);
ASSERT_EQUAL_SVE(p4, p12);
ASSERT_EQUAL_SVE(p5, p13);
ASSERT_EQUAL_SVE(p6, p14);
ASSERT_EQUAL_SVE(p7, p15);
delete[] expected;
}
delete[] data;
}
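// Write a single element of type T to `base + offset + (index *
// sizeof(T))`, as used to build the expected memory images for the
// contiguous store tests below.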
template <typename T>
static void MemoryWrite(uint8_t* base, int64_t offset, int64_t index, T data) {
memcpy(base + offset + (index * sizeof(data)), &data, sizeof(data));
}
TEST_SVE(sve_ld1_st1_contiguous) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int vl = config->sve_vl_in_bytes();
// The immediate can address [-8, 7] times the VL, so allocate enough space to
// exceed that in both directions.
int data_size = vl * 128;
uint8_t* data = new uint8_t[data_size];
memset(data, 0, data_size);
// Set the base half-way through the buffer so we can use negative indices.
__ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
// Encodable scalar-plus-immediate cases.
__ Index(z1.VnB(), 1, -3);
__ Ptrue(p1.VnB());
__ St1b(z1.VnB(), p1, SVEMemOperand(x0));
__ Index(z2.VnH(), -2, 5);
__ Ptrue(p2.VnH(), SVE_MUL3);
__ St1b(z2.VnH(), p2, SVEMemOperand(x0, 7, SVE_MUL_VL));
__ Index(z3.VnS(), 3, -7);
__ Ptrue(p3.VnS(), SVE_POW2);
__ St1h(z3.VnS(), p3, SVEMemOperand(x0, -8, SVE_MUL_VL));
// Encodable scalar-plus-scalar cases.
__ Index(z4.VnD(), -4, 11);
__ Ptrue(p4.VnD(), SVE_VL3);
__ Addvl(x1, x0, 8); // Try not to overlap with VL-dependent cases.
__ Mov(x2, 17);
__ St1b(z4.VnD(), p4, SVEMemOperand(x1, x2));
__ Index(z5.VnD(), 6, -2);
__ Ptrue(p5.VnD(), SVE_VL16);
__ Addvl(x3, x0, 10); // Try not to overlap with VL-dependent cases.
__ Mov(x4, 6);
__ St1d(z5.VnD(), p5, SVEMemOperand(x3, x4, LSL, 3));
// Unencodable cases fall back on `CalculateSVEAddress`.
__ Index(z6.VnS(), -7, 3);
// Setting SVE_ALL on B lanes checks that the Simulator ignores irrelevant
// predicate bits when handling larger lanes.
__ Ptrue(p6.VnB(), SVE_ALL);
__ St1w(z6.VnS(), p6, SVEMemOperand(x0, 42, SVE_MUL_VL));
__ Index(z7.VnD(), 32, -11);
__ Ptrue(p7.VnD(), SVE_MUL4);
__ St1w(z7.VnD(), p7, SVEMemOperand(x0, 22, SVE_MUL_VL));
// Corresponding loads.
__ Ld1b(z8.VnB(), p1.Zeroing(), SVEMemOperand(x0));
__ Ld1b(z9.VnH(), p2.Zeroing(), SVEMemOperand(x0, 7, SVE_MUL_VL));
__ Ld1h(z10.VnS(), p3.Zeroing(), SVEMemOperand(x0, -8, SVE_MUL_VL));
__ Ld1b(z11.VnD(), p4.Zeroing(), SVEMemOperand(x1, x2));
__ Ld1d(z12.VnD(), p5.Zeroing(), SVEMemOperand(x3, x4, LSL, 3));
__ Ld1w(z13.VnS(), p6.Zeroing(), SVEMemOperand(x0, 42, SVE_MUL_VL));
__ Ld1sb(z14.VnH(), p2.Zeroing(), SVEMemOperand(x0, 7, SVE_MUL_VL));
__ Ld1sh(z15.VnS(), p3.Zeroing(), SVEMemOperand(x0, -8, SVE_MUL_VL));
__ Ld1sb(z16.VnD(), p4.Zeroing(), SVEMemOperand(x1, x2));
__ Ld1sw(z17.VnD(), p7.Zeroing(), SVEMemOperand(x0, 22, SVE_MUL_VL));
// We can test ld1 by comparing the value loaded with the value stored. In
// most cases, there are two complications:
// - Loads have zeroing predication, so we have to clear the inactive
// elements on our reference.
// - We have to replicate any sign- or zero-extension.
// Ld1b(z8.VnB(), ...)
__ Dup(z18.VnB(), 0);
__ Mov(z18.VnB(), p1.Merging(), z1.VnB());
// Ld1b(z9.VnH(), ...)
__ Dup(z19.VnH(), 0);
__ Uxtb(z19.VnH(), p2.Merging(), z2.VnH());
// Ld1h(z10.VnS(), ...)
__ Dup(z20.VnS(), 0);
__ Uxth(z20.VnS(), p3.Merging(), z3.VnS());
// Ld1b(z11.VnD(), ...)
__ Dup(z21.VnD(), 0);
__ Uxtb(z21.VnD(), p4.Merging(), z4.VnD());
// Ld1d(z12.VnD(), ...)
__ Dup(z22.VnD(), 0);
__ Mov(z22.VnD(), p5.Merging(), z5.VnD());
// Ld1w(z13.VnS(), ...)
__ Dup(z23.VnS(), 0);
__ Mov(z23.VnS(), p6.Merging(), z6.VnS());
// Ld1sb(z14.VnH(), ...)
__ Dup(z24.VnH(), 0);
__ Sxtb(z24.VnH(), p2.Merging(), z2.VnH());
// Ld1sh(z15.VnS(), ...)
__ Dup(z25.VnS(), 0);
__ Sxth(z25.VnS(), p3.Merging(), z3.VnS());
// Ld1sb(z16.VnD(), ...)
__ Dup(z26.VnD(), 0);
__ Sxtb(z26.VnD(), p4.Merging(), z4.VnD());
// Ld1sw(z17.VnD(), ...)
__ Dup(z27.VnD(), 0);
__ Sxtw(z27.VnD(), p7.Merging(), z7.VnD());
END();
if (CAN_RUN()) {
RUN();
uint8_t* expected = new uint8_t[data_size];
memset(expected, 0, data_size);
uint8_t* middle = &expected[data_size / 2];
int vl_b = vl / kBRegSizeInBytes;
int vl_h = vl / kHRegSizeInBytes;
int vl_s = vl / kSRegSizeInBytes;
int vl_d = vl / kDRegSizeInBytes;
// Encodable cases.
// st1b { z1.b }, SVE_ALL
for (int i = 0; i < vl_b; i++) {
MemoryWrite(middle, 0, i, static_cast<uint8_t>(1 - (3 * i)));
}
// st1b { z2.h }, SVE_MUL3
int vl_h_mul3 = vl_h - (vl_h % 3);
for (int i = 0; i < vl_h_mul3; i++) {
int64_t offset = 7 * static_cast<int>(vl / (kHRegSize / kBRegSize));
MemoryWrite(middle, offset, i, static_cast<uint8_t>(-2 + (5 * i)));
}
// st1h { z3.s }, SVE_POW2
int vl_s_pow2 = 1 << HighestSetBitPosition(vl_s);
for (int i = 0; i < vl_s_pow2; i++) {
int64_t offset = -8 * static_cast<int>(vl / (kSRegSize / kHRegSize));
MemoryWrite(middle, offset, i, static_cast<uint16_t>(3 - (7 * i)));
}
// st1b { z4.d }, SVE_VL3
if (vl_d >= 3) {
for (int i = 0; i < 3; i++) {
MemoryWrite(middle,
(8 * vl) + 17,
i,
static_cast<uint8_t>(-4 + (11 * i)));
}
}
// st1d { z5.d }, SVE_VL16
if (vl_d >= 16) {
for (int i = 0; i < 16; i++) {
MemoryWrite(middle,
(10 * vl) + (6 * kDRegSizeInBytes),
i,
static_cast<uint64_t>(6 - (2 * i)));
}
}
// Unencodable cases.
// st1w { z6.s }, SVE_ALL
for (int i = 0; i < vl_s; i++) {
MemoryWrite(middle, 42 * vl, i, static_cast<uint32_t>(-7 + (3 * i)));
}
// st1w { z7.d }, SVE_MUL4
int vl_d_mul4 = vl_d - (vl_d % 4);
for (int i = 0; i < vl_d_mul4; i++) {
int64_t offset = 22 * static_cast<int>(vl / (kDRegSize / kWRegSize));
MemoryWrite(middle, offset, i, static_cast<uint32_t>(32 + (-11 * i)));
}
ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
// Check that we loaded back the expected values.
ASSERT_EQUAL_SVE(z18, z8);
ASSERT_EQUAL_SVE(z19, z9);
ASSERT_EQUAL_SVE(z20, z10);
ASSERT_EQUAL_SVE(z21, z11);
ASSERT_EQUAL_SVE(z22, z12);
ASSERT_EQUAL_SVE(z23, z13);
ASSERT_EQUAL_SVE(z24, z14);
ASSERT_EQUAL_SVE(z25, z15);
ASSERT_EQUAL_SVE(z26, z16);
ASSERT_EQUAL_SVE(z27, z17);
delete[] expected;
}
delete[] data;
}
TEST_SVE(sve_ld2_st2_scalar_plus_imm) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int vl = config->sve_vl_in_bytes();
// The immediate can address [-16, 14] times the VL, so allocate enough space
// to exceed that in both directions.
int data_size = vl * 128;
uint8_t* data = new uint8_t[data_size];
memset(data, 0, data_size);
// Set the base half-way through the buffer so we can use negative indices.
__ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
__ Index(z14.VnB(), 1, -3);
__ Index(z15.VnB(), 2, -3);
__ Ptrue(p0.VnB());
__ St2b(z14.VnB(), z15.VnB(), p0, SVEMemOperand(x0));
__ Index(z16.VnH(), -2, 5);
__ Index(z17.VnH(), -3, 5);
__ Ptrue(p1.VnH(), SVE_MUL3);
__ St2h(z16.VnH(), z17.VnH(), p1, SVEMemOperand(x0, 8, SVE_MUL_VL));
// Wrap around from z31 to z0.
__ Index(z31.VnS(), 3, -7);
__ Index(z0.VnS(), 4, -7);
__ Ptrue(p2.VnS(), SVE_POW2);
__ St2w(z31.VnS(), z0.VnS(), p2, SVEMemOperand(x0, -12, SVE_MUL_VL));
__ Index(z18.VnD(), -7, 3);
__ Index(z19.VnD(), -8, 3);
// Sparse predication, including some irrelevant bits (0xe). To make the
// results easy to check, activate each lane <n> where n is a multiple of 5.
Initialise(&masm,
p3,
0xeee10000000001ee,
0xeeeeeee100000000,
0x01eeeeeeeee10000,
0x000001eeeeeeeee1);
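// Each D-sized lane consumes eight predicate bits, of which only the lowest
// is significant; the 0xe nibbles exercise the bits that should be ignored.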
__ St2d(z18.VnD(), z19.VnD(), p3, SVEMemOperand(x0, 14, SVE_MUL_VL));
// We can test ld2 by comparing the values loaded with the values stored.
// There are two complications:
// - Loads have zeroing predication, so we have to clear the inactive
// elements on our reference.
// - We want to test both loads and stores that span { z31, z0 }, so we have
// to move some values around.
//
// Registers z4-z11 will hold as-stored values (with inactive elements
// cleared). Registers z20-z27 will hold the values that were loaded.
// Ld2b(z14.VnB(), z15.VnB(), ...)
__ Dup(z4.VnB(), 0);
__ Dup(z5.VnB(), 0);
__ Mov(z4.VnB(), p0.Merging(), z14.VnB());
__ Mov(z5.VnB(), p0.Merging(), z15.VnB());
// Ld2h(z16.VnH(), z17.VnH(), ...)
__ Dup(z6.VnH(), 0);
__ Dup(z7.VnH(), 0);
__ Mov(z6.VnH(), p1.Merging(), z16.VnH());
__ Mov(z7.VnH(), p1.Merging(), z17.VnH());
// Ld2w(z31.VnS(), z0.VnS(), ...)
__ Dup(z8.VnS(), 0);
__ Dup(z9.VnS(), 0);
__ Mov(z8.VnS(), p2.Merging(), z31.VnS());
__ Mov(z9.VnS(), p2.Merging(), z0.VnS());
// Ld2d(z18.VnD(), z19.VnD(), ...)
__ Dup(z10.VnD(), 0);
__ Dup(z11.VnD(), 0);
__ Mov(z10.VnD(), p3.Merging(), z18.VnD());
__ Mov(z11.VnD(), p3.Merging(), z19.VnD());
// Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
__ Ld2b(z31.VnB(), z0.VnB(), p0.Zeroing(), SVEMemOperand(x0));
__ Mov(z20, z31);
__ Mov(z21, z0);
__ Ld2h(z22.VnH(), z23.VnH(), p1.Zeroing(), SVEMemOperand(x0, 8, SVE_MUL_VL));
__ Ld2w(z24.VnS(),
z25.VnS(),
p2.Zeroing(),
SVEMemOperand(x0, -12, SVE_MUL_VL));
__ Ld2d(z26.VnD(),
z27.VnD(),
p3.Zeroing(),
SVEMemOperand(x0, 14, SVE_MUL_VL));
END();
if (CAN_RUN()) {
RUN();
uint8_t* expected = new uint8_t[data_size];
memset(expected, 0, data_size);
uint8_t* middle = &expected[data_size / 2];
int vl_b = vl / kBRegSizeInBytes;
int vl_h = vl / kHRegSizeInBytes;
int vl_s = vl / kSRegSizeInBytes;
int vl_d = vl / kDRegSizeInBytes;
int reg_count = 2;
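// stN interleaves its registers in memory: lane i of the first register
// lands at element (i * reg_count), lane i of the second at
// (i * reg_count) + 1, and so on.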
// st2b { z14.b, z15.b }, SVE_ALL
for (int i = 0; i < vl_b; i++) {
uint8_t lane0 = 1 - (3 * i);
uint8_t lane1 = 2 - (3 * i);
MemoryWrite(middle, 0, (i * reg_count) + 0, lane0);
MemoryWrite(middle, 0, (i * reg_count) + 1, lane1);
}
// st2h { z16.h, z17.h }, SVE_MUL3
int vl_h_mul3 = vl_h - (vl_h % 3);
for (int i = 0; i < vl_h_mul3; i++) {
int64_t offset = 8 * vl;
uint16_t lane0 = -2 + (5 * i);
uint16_t lane1 = -3 + (5 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
}
// st2w { z31.s, z0.s }, SVE_POW2
int vl_s_pow2 = 1 << HighestSetBitPosition(vl_s);
for (int i = 0; i < vl_s_pow2; i++) {
int64_t offset = -12 * vl;
uint32_t lane0 = 3 - (7 * i);
uint32_t lane1 = 4 - (7 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
}
// st2d { z18.d, z19.d }, ((i % 5) == 0)
for (int i = 0; i < vl_d; i++) {
if ((i % 5) == 0) {
int64_t offset = 14 * vl;
uint64_t lane0 = -7 + (3 * i);
uint64_t lane1 = -8 + (3 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
}
}
ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
// Check that we loaded back the expected values.
// st2b/ld2b
ASSERT_EQUAL_SVE(z4, z20);
ASSERT_EQUAL_SVE(z5, z21);
// st2h/ld2h
ASSERT_EQUAL_SVE(z6, z22);
ASSERT_EQUAL_SVE(z7, z23);
// st2w/ld2w
ASSERT_EQUAL_SVE(z8, z24);
ASSERT_EQUAL_SVE(z9, z25);
// st2d/ld2d
ASSERT_EQUAL_SVE(z10, z26);
ASSERT_EQUAL_SVE(z11, z27);
delete[] expected;
}
delete[] data;
}
TEST_SVE(sve_ld2_st2_scalar_plus_scalar) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int vl = config->sve_vl_in_bytes();
// Allocate plenty of space to enable indexing in both directions.
int data_size = vl * 128;
uint8_t* data = new uint8_t[data_size];
memset(data, 0, data_size);
// Set the base half-way through the buffer so we can use negative indices.
__ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
__ Index(z10.VnB(), -4, 11);
__ Index(z11.VnB(), -5, 11);
__ Ptrue(p7.VnB(), SVE_MUL4);
__ Mov(x1, 0);
__ St2b(z10.VnB(), z11.VnB(), p7, SVEMemOperand(x0, x1));
__ Index(z12.VnH(), 6, -2);
__ Index(z13.VnH(), 7, -2);
__ Ptrue(p6.VnH(), SVE_VL16);
__ Rdvl(x2, 3); // Make offsets VL-dependent so we can avoid overlap.
__ St2h(z12.VnH(), z13.VnH(), p6, SVEMemOperand(x0, x2, LSL, 1));
__ Index(z14.VnS(), -7, 3);
__ Index(z15.VnS(), -8, 3);
// Sparse predication, including some irrelevant bits (0xe). To make the
// results easy to check, activate each lane <n> where n is a multiple of 5.
Initialise(&masm,
p5,
0xeee1000010000100,
0x001eeee100001000,
0x0100001eeee10000,
0x10000100001eeee1);
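// Each S-sized lane consumes four predicate bits, of which only the lowest
// is significant, so this activates lanes 0, 5, 10 and so on.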
__ Rdvl(x3, -3);
__ St2w(z14.VnS(), z15.VnS(), p5, SVEMemOperand(x0, x3, LSL, 2));
// Wrap around from z31 to z0.
__ Index(z31.VnD(), 32, -11);
__ Index(z0.VnD(), 33, -11);
__ Ptrue(p4.VnD(), SVE_MUL3);
__ Rdvl(x4, 1);
__ St2d(z31.VnD(), z0.VnD(), p4, SVEMemOperand(x0, x4, LSL, 3));
// We can test ld2 by comparing the values loaded with the values stored.
// There are two complications:
// - Loads have zeroing predication, so we have to clear the inactive
// elements on our reference.
// - We want to test both loads and stores that span { z31, z0 }, so we have
// to move some values around.
//
// Registers z4-z11 will hold as-stored values (with inactive elements
// cleared). Registers z20-z27 will hold the values that were loaded.
// Ld2b(z20.VnB(), z21.VnB(), ...)
__ Dup(z4.VnB(), 0);
__ Dup(z5.VnB(), 0);
__ Mov(z4.VnB(), p7.Merging(), z10.VnB());
__ Mov(z5.VnB(), p7.Merging(), z11.VnB());
// Ld2h(z22.VnH(), z23.VnH(), ...)
__ Dup(z6.VnH(), 0);
__ Dup(z7.VnH(), 0);
__ Mov(z6.VnH(), p6.Merging(), z12.VnH());
__ Mov(z7.VnH(), p6.Merging(), z13.VnH());
// Ld2w(z24.VnS(), z25.VnS(), ...)
__ Dup(z8.VnS(), 0);
__ Dup(z9.VnS(), 0);
__ Mov(z8.VnS(), p5.Merging(), z14.VnS());
__ Mov(z9.VnS(), p5.Merging(), z15.VnS());
// Ld2d(z31.VnD(), z0.VnD(), ...)
__ Dup(z10.VnD(), 0);
__ Dup(z11.VnD(), 0);
__ Mov(z10.VnD(), p4.Merging(), z31.VnD());
__ Mov(z11.VnD(), p4.Merging(), z0.VnD());
// Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
__ Ld2b(z31.VnB(), z0.VnB(), p7.Zeroing(), SVEMemOperand(x0, x1));
__ Mov(z20, z31);
__ Mov(z21, z0);
__ Ld2h(z22.VnH(), z23.VnH(), p6.Zeroing(), SVEMemOperand(x0, x2, LSL, 1));
__ Ld2w(z24.VnS(), z25.VnS(), p5.Zeroing(), SVEMemOperand(x0, x3, LSL, 2));
__ Ld2d(z26.VnD(), z27.VnD(), p4.Zeroing(), SVEMemOperand(x0, x4, LSL, 3));
END();
if (CAN_RUN()) {
RUN();
uint8_t* expected = new uint8_t[data_size];
memset(expected, 0, data_size);
uint8_t* middle = &expected[data_size / 2];
int vl_b = vl / kBRegSizeInBytes;
int vl_h = vl / kHRegSizeInBytes;
int vl_s = vl / kSRegSizeInBytes;
int vl_d = vl / kDRegSizeInBytes;
int reg_count = 2;
// st2b { z10.b, z11.b }, SVE_MUL4
int vl_b_mul4 = vl_b - (vl_b % 4);
for (int i = 0; i < vl_b_mul4; i++) {
uint8_t lane0 = -4 + (11 * i);
uint8_t lane1 = -5 + (11 * i);
MemoryWrite(middle, 0, (i * reg_count) + 0, lane0);
MemoryWrite(middle, 0, (i * reg_count) + 1, lane1);
}
// st2h { z12.h, z13.h }, SVE_VL16
if (vl_h >= 16) {
for (int i = 0; i < 16; i++) {
int64_t offset = (3 << kHRegSizeInBytesLog2) * vl;
uint16_t lane0 = 6 - (2 * i);
uint16_t lane1 = 7 - (2 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
}
}
// st2w { z14.s, z15.s }, ((i % 5) == 0)
for (int i = 0; i < vl_s; i++) {
if ((i % 5) == 0) {
int64_t offset = -(3 << kSRegSizeInBytesLog2) * vl;
uint32_t lane0 = -7 + (3 * i);
uint32_t lane1 = -8 + (3 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
}
}
// st2d { z31.d, z0.d }, SVE_MUL3
int vl_d_mul3 = vl_d - (vl_d % 3);
for (int i = 0; i < vl_d_mul3; i++) {
int64_t offset = (1 << kDRegSizeInBytesLog2) * vl;
uint64_t lane0 = 32 - (11 * i);
uint64_t lane1 = 33 - (11 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
}
ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
// Check that we loaded back the expected values.
// st2b/ld2b
ASSERT_EQUAL_SVE(z4, z20);
ASSERT_EQUAL_SVE(z5, z21);
// st2h/ld2h
ASSERT_EQUAL_SVE(z6, z22);
ASSERT_EQUAL_SVE(z7, z23);
// st2w/ld2w
ASSERT_EQUAL_SVE(z8, z24);
ASSERT_EQUAL_SVE(z9, z25);
// st2d/ld2d
ASSERT_EQUAL_SVE(z10, z26);
ASSERT_EQUAL_SVE(z11, z27);
delete[] expected;
}
delete[] data;
}
TEST_SVE(sve_ld3_st3_scalar_plus_imm) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int vl = config->sve_vl_in_bytes();
// The immediate can address [-24, 21] times the VL, so allocate enough space
// to exceed that in both directions.
int data_size = vl * 128;
uint8_t* data = new uint8_t[data_size];
memset(data, 0, data_size);
// Set the base half-way through the buffer so we can use negative indices.
__ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
// We can test ld3 by comparing the values loaded with the values stored.
// There are two complications:
// - Loads have zeroing predication, so we have to clear the inactive
// elements on our reference.
// - We want to test both loads and stores that span { z31, z0 }, so we have
// to move some values around.
//
// Registers z4-z15 will hold as-stored values (with inactive elements
// cleared). Registers z16-z27 will hold the values that were loaded.
__ Index(z10.VnB(), 1, -3);
__ Index(z11.VnB(), 2, -3);
__ Index(z12.VnB(), 3, -3);
__ Ptrue(p0.VnB());
__ St3b(z10.VnB(), z11.VnB(), z12.VnB(), p0, SVEMemOperand(x0));
// Save the stored values for ld3 tests.
__ Dup(z4.VnB(), 0);
__ Dup(z5.VnB(), 0);
__ Dup(z6.VnB(), 0);
__ Mov(z4.VnB(), p0.Merging(), z10.VnB());
__ Mov(z5.VnB(), p0.Merging(), z11.VnB());
__ Mov(z6.VnB(), p0.Merging(), z12.VnB());
// Wrap around from z31 to z0.
__ Index(z31.VnH(), -2, 5);
__ Index(z0.VnH(), -3, 5);
__ Index(z1.VnH(), -4, 5);
__ Ptrue(p1.VnH(), SVE_MUL3);
__ St3h(z31.VnH(), z0.VnH(), z1.VnH(), p1, SVEMemOperand(x0, 9, SVE_MUL_VL));
// Save the stored values for ld3 tests.
__ Dup(z7.VnH(), 0);
__ Dup(z8.VnH(), 0);
__ Dup(z9.VnH(), 0);
__ Mov(z7.VnH(), p1.Merging(), z31.VnH());
__ Mov(z8.VnH(), p1.Merging(), z0.VnH());
__ Mov(z9.VnH(), p1.Merging(), z1.VnH());
__ Index(z30.VnS(), 3, -7);
__ Index(z31.VnS(), 4, -7);
__ Index(z0.VnS(), 5, -7);
__ Ptrue(p2.VnS(), SVE_POW2);
__ St3w(z30.VnS(),
z31.VnS(),
z0.VnS(),
p2,
SVEMemOperand(x0, -12, SVE_MUL_VL));
// Save the stored values for ld3 tests.
__ Dup(z10.VnS(), 0);
__ Dup(z11.VnS(), 0);
__ Dup(z12.VnS(), 0);
__ Mov(z10.VnS(), p2.Merging(), z30.VnS());
__ Mov(z11.VnS(), p2.Merging(), z31.VnS());
__ Mov(z12.VnS(), p2.Merging(), z0.VnS());
__ Index(z0.VnD(), -7, 3);
__ Index(z1.VnD(), -8, 3);
__ Index(z2.VnD(), -9, 3);
// Sparse predication, including some irrelevant bits (0xee). To make the
// results easy to check, activate each lane <n> where n is a multiple of 5.
Initialise(&masm,
p3,
0xeee10000000001ee,
0xeeeeeee100000000,
0x01eeeeeeeee10000,
0x000001eeeeeeeee1);
__ St3d(z0.VnD(), z1.VnD(), z2.VnD(), p3, SVEMemOperand(x0, 15, SVE_MUL_VL));
// Save the stored values for ld3 tests.
__ Dup(z13.VnD(), 0);
__ Dup(z14.VnD(), 0);
__ Dup(z15.VnD(), 0);
__ Mov(z13.VnD(), p3.Merging(), z0.VnD());
__ Mov(z14.VnD(), p3.Merging(), z1.VnD());
__ Mov(z15.VnD(), p3.Merging(), z2.VnD());
// Corresponding loads.
// Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
__ Ld3b(z31.VnB(), z0.VnB(), z1.VnB(), p0.Zeroing(), SVEMemOperand(x0));
__ Mov(z16, z31);
__ Mov(z17, z0);
__ Mov(z18, z1);
__ Ld3h(z30.VnH(),
z31.VnH(),
z0.VnH(),
p1.Zeroing(),
SVEMemOperand(x0, 9, SVE_MUL_VL));
__ Mov(z19, z30);
__ Mov(z20, z31);
__ Mov(z21, z0);
__ Ld3w(z22.VnS(),
z23.VnS(),
z24.VnS(),
p2.Zeroing(),
SVEMemOperand(x0, -12, SVE_MUL_VL));
__ Ld3d(z25.VnD(),
z26.VnD(),
z27.VnD(),
p3.Zeroing(),
SVEMemOperand(x0, 15, SVE_MUL_VL));
END();
if (CAN_RUN()) {
RUN();
uint8_t* expected = new uint8_t[data_size];
memset(expected, 0, data_size);
uint8_t* middle = &expected[data_size / 2];
int vl_b = vl / kBRegSizeInBytes;
int vl_h = vl / kHRegSizeInBytes;
int vl_s = vl / kSRegSizeInBytes;
int vl_d = vl / kDRegSizeInBytes;
int reg_count = 3;
// st3b { z10.b, z11.b, z12.b }, SVE_ALL
for (int i = 0; i < vl_b; i++) {
uint8_t lane0 = 1 - (3 * i);
uint8_t lane1 = 2 - (3 * i);
uint8_t lane2 = 3 - (3 * i);
MemoryWrite(middle, 0, (i * reg_count) + 0, lane0);
MemoryWrite(middle, 0, (i * reg_count) + 1, lane1);
MemoryWrite(middle, 0, (i * reg_count) + 2, lane2);
}
// st3h { z31.h, z0.h, z1.h }, SVE_MUL3
int vl_h_mul3 = vl_h - (vl_h % 3);
for (int i = 0; i < vl_h_mul3; i++) {
int64_t offset = 9 * vl;
uint16_t lane0 = -2 + (5 * i);
uint16_t lane1 = -3 + (5 * i);
uint16_t lane2 = -4 + (5 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
}
// st3w { z30.s, z31.s, z0.s }, SVE_POW2
int vl_s_pow2 = 1 << HighestSetBitPosition(vl_s);
for (int i = 0; i < vl_s_pow2; i++) {
int64_t offset = -12 * vl;
uint32_t lane0 = 3 - (7 * i);
uint32_t lane1 = 4 - (7 * i);
uint32_t lane2 = 5 - (7 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
}
// st3d { z0.d, z1.d, z2.d }, ((i % 5) == 0)
for (int i = 0; i < vl_d; i++) {
if ((i % 5) == 0) {
int64_t offset = 15 * vl;
uint64_t lane0 = -7 + (3 * i);
uint64_t lane1 = -8 + (3 * i);
uint64_t lane2 = -9 + (3 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
}
}
ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
// Check that we loaded back the expected values.
// st3b/ld3b
ASSERT_EQUAL_SVE(z4, z16);
ASSERT_EQUAL_SVE(z5, z17);
ASSERT_EQUAL_SVE(z6, z18);
// st3h/ld3h
ASSERT_EQUAL_SVE(z7, z19);
ASSERT_EQUAL_SVE(z8, z20);
ASSERT_EQUAL_SVE(z9, z21);
// st3w/ld3w
ASSERT_EQUAL_SVE(z10, z22);
ASSERT_EQUAL_SVE(z11, z23);
ASSERT_EQUAL_SVE(z12, z24);
// st3d/ld3d
ASSERT_EQUAL_SVE(z13, z25);
ASSERT_EQUAL_SVE(z14, z26);
ASSERT_EQUAL_SVE(z15, z27);
delete[] expected;
}
delete[] data;
}
TEST_SVE(sve_ld3_st3_scalar_plus_scalar) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int vl = config->sve_vl_in_bytes();
// Allocate plenty of space to enable indexing in both directions.
int data_size = vl * 128;
uint8_t* data = new uint8_t[data_size];
memset(data, 0, data_size);
// Set the base half-way through the buffer so we can use negative indices.
__ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
// We can test ld3 by comparing the values loaded with the values stored.
// There are two complications:
// - Loads have zeroing predication, so we have to clear the inactive
// elements on our reference.
// - We want to test both loads and stores that span { z31, z0 }, so we have
// to move some values around.
//
// Registers z4-z15 will hold as-stored values (with inactive elements
// cleared). Registers z16-z27 will hold the values that were loaded.
__ Index(z10.VnB(), -4, 11);
__ Index(z11.VnB(), -5, 11);
__ Index(z12.VnB(), -6, 11);
__ Ptrue(p7.VnB(), SVE_MUL4);
__ Rdvl(x1, -1); // Make offsets VL-dependent so we can avoid overlap.
__ St3b(z10.VnB(), z11.VnB(), z12.VnB(), p7, SVEMemOperand(x0, x1, LSL, 0));
// Save the stored values for ld3 tests.
__ Dup(z4.VnB(), 0);
__ Dup(z5.VnB(), 0);
__ Dup(z6.VnB(), 0);
__ Mov(z4.VnB(), p7.Merging(), z10.VnB());
__ Mov(z5.VnB(), p7.Merging(), z11.VnB());
__ Mov(z6.VnB(), p7.Merging(), z12.VnB());
__ Index(z13.VnH(), 6, -2);
__ Index(z14.VnH(), 7, -2);
__ Index(z15.VnH(), 8, -2);
__ Ptrue(p6.VnH(), SVE_VL16);
__ Rdvl(x2, 5); // (5 * vl) << 1 = 10 * vl
__ St3h(z13.VnH(), z14.VnH(), z15.VnH(), p6, SVEMemOperand(x0, x2, LSL, 1));
// Save the stored values for ld3 tests.
__ Dup(z7.VnH(), 0);
__ Dup(z8.VnH(), 0);
__ Dup(z9.VnH(), 0);
__ Mov(z7.VnH(), p6.Merging(), z13.VnH());
__ Mov(z8.VnH(), p6.Merging(), z14.VnH());
__ Mov(z9.VnH(), p6.Merging(), z15.VnH());
// Wrap around from z31 to z0.
__ Index(z30.VnS(), -7, 3);
__ Index(z31.VnS(), -8, 3);
__ Index(z0.VnS(), -9, 3);
// Sparse predication, including some irrelevant bits (0xe). To make the
// results easy to check, activate each lane <n> where n is a multiple of 5.
Initialise(&masm,
p5,
0xeee1000010000100,
0x001eeee100001000,
0x0100001eeee10000,
0x10000100001eeee1);
__ Rdvl(x3, -5); // -(5 * vl) << 2 = -20 * vl
__ St3w(z30.VnS(), z31.VnS(), z0.VnS(), p5, SVEMemOperand(x0, x3, LSL, 2));
// Save the stored values for ld3 tests.
__ Dup(z10.VnS(), 0);
__ Dup(z11.VnS(), 0);
__ Dup(z12.VnS(), 0);
__ Mov(z10.VnS(), p5.Merging(), z30.VnS());
__ Mov(z11.VnS(), p5.Merging(), z31.VnS());
__ Mov(z12.VnS(), p5.Merging(), z0.VnS());
__ Index(z31.VnD(), 32, -11);
__ Index(z0.VnD(), 33, -11);
__ Index(z1.VnD(), 34, -11);
__ Ptrue(p4.VnD(), SVE_MUL3);
__ Rdvl(x4, -1); // -(1 * vl) << 3 = -8 * vl
__ St3d(z31.VnD(), z0.VnD(), z1.VnD(), p4, SVEMemOperand(x0, x4, LSL, 3));
// Save the stored values for ld3 tests.
__ Dup(z13.VnD(), 0);
__ Dup(z14.VnD(), 0);
__ Dup(z15.VnD(), 0);
__ Mov(z13.VnD(), p4.Merging(), z31.VnD());
__ Mov(z14.VnD(), p4.Merging(), z0.VnD());
__ Mov(z15.VnD(), p4.Merging(), z1.VnD());
// Corresponding loads.
// Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
__ Ld3b(z31.VnB(),
z0.VnB(),
z1.VnB(),
p7.Zeroing(),
SVEMemOperand(x0, x1, LSL, 0));
__ Mov(z16, z31);
__ Mov(z17, z0);
__ Mov(z18, z1);
__ Ld3h(z30.VnH(),
z31.VnH(),
z0.VnH(),
p6.Zeroing(),
SVEMemOperand(x0, x2, LSL, 1));
__ Mov(z19, z30);
__ Mov(z20, z31);
__ Mov(z21, z0);
__ Ld3w(z22.VnS(),
z23.VnS(),
z24.VnS(),
p5.Zeroing(),
SVEMemOperand(x0, x3, LSL, 2));
__ Ld3d(z25.VnD(),
z26.VnD(),
z27.VnD(),
p4.Zeroing(),
SVEMemOperand(x0, x4, LSL, 3));
END();
if (CAN_RUN()) {
RUN();
uint8_t* expected = new uint8_t[data_size];
memset(expected, 0, data_size);
uint8_t* middle = &expected[data_size / 2];
int vl_b = vl / kBRegSizeInBytes;
int vl_h = vl / kHRegSizeInBytes;
int vl_s = vl / kSRegSizeInBytes;
int vl_d = vl / kDRegSizeInBytes;
int reg_count = 3;
// st3b { z10.b, z11.b, z12.b }, SVE_MUL4
int vl_b_mul4 = vl_b - (vl_b % 4);
for (int i = 0; i < vl_b_mul4; i++) {
int64_t offset = -(1 << kBRegSizeInBytesLog2) * vl;
uint8_t lane0 = -4 + (11 * i);
uint8_t lane1 = -5 + (11 * i);
uint8_t lane2 = -6 + (11 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
}
// st3h { z13.h, z14.h, z15.h }, SVE_VL16
if (vl_h >= 16) {
for (int i = 0; i < 16; i++) {
int64_t offset = (5 << kHRegSizeInBytesLog2) * vl;
uint16_t lane0 = 6 - (2 * i);
uint16_t lane1 = 7 - (2 * i);
uint16_t lane2 = 8 - (2 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
}
}
// st3w { z30.s, z31.s, z0.s }, ((i % 5) == 0)
for (int i = 0; i < vl_s; i++) {
if ((i % 5) == 0) {
int64_t offset = -(5 << kSRegSizeInBytesLog2) * vl;
uint32_t lane0 = -7 + (3 * i);
uint32_t lane1 = -8 + (3 * i);
uint32_t lane2 = -9 + (3 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
}
}
// st3d { z31.d, z0.d, z1.d }, SVE_MUL3
int vl_d_mul3 = vl_d - (vl_d % 3);
for (int i = 0; i < vl_d_mul3; i++) {
int64_t offset = -(1 << kDRegSizeInBytesLog2) * vl;
uint64_t lane0 = 32 - (11 * i);
uint64_t lane1 = 33 - (11 * i);
uint64_t lane2 = 34 - (11 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
}
ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
// Check that we loaded back the expected values.
// st3b/ld3b
ASSERT_EQUAL_SVE(z4, z16);
ASSERT_EQUAL_SVE(z5, z17);
ASSERT_EQUAL_SVE(z6, z18);
// st3h/ld3h
ASSERT_EQUAL_SVE(z7, z19);
ASSERT_EQUAL_SVE(z8, z20);
ASSERT_EQUAL_SVE(z9, z21);
// st3w/ld3w
ASSERT_EQUAL_SVE(z10, z22);
ASSERT_EQUAL_SVE(z11, z23);
ASSERT_EQUAL_SVE(z12, z24);
// st3d/ld3d
ASSERT_EQUAL_SVE(z13, z25);
ASSERT_EQUAL_SVE(z14, z26);
ASSERT_EQUAL_SVE(z15, z27);
delete[] expected;
}
delete[] data;
}
TEST_SVE(sve_ld4_st4_scalar_plus_imm) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int vl = config->sve_vl_in_bytes();
// The immediate can address [-32, 28] times the VL, so allocate enough space
// to exceed that in both directions.
int data_size = vl * 128;
uint8_t* data = new uint8_t[data_size];
memset(data, 0, data_size);
// Set the base half-way through the buffer so we can use negative indices.
__ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
// We can test ld4 by comparing the values loaded with the values stored.
// There are two complications:
// - Loads have zeroing predication, so we have to clear the inactive
// elements on our reference.
// - We want to test both loads and stores that span { z31, z0 }, so we have
// to move some values around.
//
// Registers z3-z18 will hold as-stored values (with inactive elements
// cleared). Registers z19-z31 and z0-z2 will hold the values that were
// loaded.
__ Index(z10.VnB(), 1, -7);
__ Index(z11.VnB(), 2, -7);
__ Index(z12.VnB(), 3, -7);
__ Index(z13.VnB(), 4, -7);
__ Ptrue(p0.VnB());
__ St4b(z10.VnB(), z11.VnB(), z12.VnB(), z13.VnB(), p0, SVEMemOperand(x0));
// Save the stored values for ld4 tests.
__ Dup(z3.VnB(), 0);
__ Dup(z4.VnB(), 0);
__ Dup(z5.VnB(), 0);
__ Dup(z6.VnB(), 0);
__ Mov(z3.VnB(), p0.Merging(), z10.VnB());
__ Mov(z4.VnB(), p0.Merging(), z11.VnB());
__ Mov(z5.VnB(), p0.Merging(), z12.VnB());
__ Mov(z6.VnB(), p0.Merging(), z13.VnB());
// Wrap around from z31 to z0.
__ Index(z31.VnH(), -2, 5);
__ Index(z0.VnH(), -3, 5);
__ Index(z1.VnH(), -4, 5);
__ Index(z2.VnH(), -5, 5);
__ Ptrue(p1.VnH(), SVE_MUL3);
__ St4h(z31.VnH(),
z0.VnH(),
z1.VnH(),
z2.VnH(),
p1,
SVEMemOperand(x0, 4, SVE_MUL_VL));
// Save the stored values for ld4 tests.
__ Dup(z7.VnH(), 0);
__ Dup(z8.VnH(), 0);
__ Dup(z9.VnH(), 0);
__ Dup(z10.VnH(), 0);
__ Mov(z7.VnH(), p1.Merging(), z31.VnH());
__ Mov(z8.VnH(), p1.Merging(), z0.VnH());
__ Mov(z9.VnH(), p1.Merging(), z1.VnH());
__ Mov(z10.VnH(), p1.Merging(), z2.VnH());
// Wrap around from z31 to z0.
__ Index(z29.VnS(), 2, -7);
__ Index(z30.VnS(), 3, -7);
__ Index(z31.VnS(), 4, -7);
__ Index(z0.VnS(), 5, -7);
__ Ptrue(p2.VnS(), SVE_POW2);
__ St4w(z29.VnS(),
z30.VnS(),
z31.VnS(),
z0.VnS(),
p2,
SVEMemOperand(x0, -12, SVE_MUL_VL));
// Save the stored values for ld4 tests.
__ Dup(z11.VnS(), 0);
__ Dup(z12.VnS(), 0);
__ Dup(z13.VnS(), 0);
__ Dup(z14.VnS(), 0);
__ Mov(z11.VnS(), p2.Merging(), z29.VnS());
__ Mov(z12.VnS(), p2.Merging(), z30.VnS());
__ Mov(z13.VnS(), p2.Merging(), z31.VnS());
__ Mov(z14.VnS(), p2.Merging(), z0.VnS());
__ Index(z20.VnD(), -7, 8);
__ Index(z21.VnD(), -8, 8);
__ Index(z22.VnD(), -9, 8);
__ Index(z23.VnD(), -10, 8);
// Sparse predication, including some irrelevant bits (0xee). To make the
// results easy to check, activate each lane <n> where n is a multiple of 5.
Initialise(&masm,
p3,
0xeee10000000001ee,
0xeeeeeee100000000,
0x01eeeeeeeee10000,
0x000001eeeeeeeee1);
__ St4d(z20.VnD(),
z21.VnD(),
z22.VnD(),
z23.VnD(),
p3,
SVEMemOperand(x0, 16, SVE_MUL_VL));
// Save the stored values for ld4 tests.
__ Dup(z15.VnD(), 0);
__ Dup(z16.VnD(), 0);
__ Dup(z17.VnD(), 0);
__ Dup(z18.VnD(), 0);
__ Mov(z15.VnD(), p3.Merging(), z20.VnD());
__ Mov(z16.VnD(), p3.Merging(), z21.VnD());
__ Mov(z17.VnD(), p3.Merging(), z22.VnD());
__ Mov(z18.VnD(), p3.Merging(), z23.VnD());
// Corresponding loads.
// Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
__ Ld4b(z31.VnB(),
z0.VnB(),
z1.VnB(),
z2.VnB(),
p0.Zeroing(),
SVEMemOperand(x0));
__ Mov(z19, z31);
__ Mov(z20, z0);
__ Mov(z21, z1);
__ Mov(z22, z2);
__ Ld4h(z23.VnH(),
z24.VnH(),
z25.VnH(),
z26.VnH(),
p1.Zeroing(),
SVEMemOperand(x0, 4, SVE_MUL_VL));
__ Ld4w(z27.VnS(),
z28.VnS(),
z29.VnS(),
z30.VnS(),
p2.Zeroing(),
SVEMemOperand(x0, -12, SVE_MUL_VL));
// Wrap around from z31 to z0.
__ Ld4d(z31.VnD(),
z0.VnD(),
z1.VnD(),
z2.VnD(),
p3.Zeroing(),
SVEMemOperand(x0, 16, SVE_MUL_VL));
END();
if (CAN_RUN()) {
RUN();
uint8_t* expected = new uint8_t[data_size];
memset(expected, 0, data_size);
uint8_t* middle = &expected[data_size / 2];
int vl_b = vl / kBRegSizeInBytes;
int vl_h = vl / kHRegSizeInBytes;
int vl_s = vl / kSRegSizeInBytes;
int vl_d = vl / kDRegSizeInBytes;
int reg_count = 4;
// st4b { z10.b, z11.b, z12.b, z13.b }, SVE_ALL
for (int i = 0; i < vl_b; i++) {
uint8_t lane0 = 1 - (7 * i);
uint8_t lane1 = 2 - (7 * i);
uint8_t lane2 = 3 - (7 * i);
uint8_t lane3 = 4 - (7 * i);
MemoryWrite(middle, 0, (i * reg_count) + 0, lane0);
MemoryWrite(middle, 0, (i * reg_count) + 1, lane1);
MemoryWrite(middle, 0, (i * reg_count) + 2, lane2);
MemoryWrite(middle, 0, (i * reg_count) + 3, lane3);
}
// st4h { z31.h, z0.h, z1.h, z2.h }, SVE_MUL3
int vl_h_mul3 = vl_h - (vl_h % 3);
for (int i = 0; i < vl_h_mul3; i++) {
int64_t offset = 4 * vl;
uint16_t lane0 = -2 + (5 * i);
uint16_t lane1 = -3 + (5 * i);
uint16_t lane2 = -4 + (5 * i);
uint16_t lane3 = -5 + (5 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
}
// st4w { z29.s, z30.s, z31.s, z0.s }, SVE_POW2
int vl_s_pow2 = 1 << HighestSetBitPosition(vl_s);
for (int i = 0; i < vl_s_pow2; i++) {
int64_t offset = -12 * vl;
uint32_t lane0 = 2 - (7 * i);
uint32_t lane1 = 3 - (7 * i);
uint32_t lane2 = 4 - (7 * i);
uint32_t lane3 = 5 - (7 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
}
// st4d { z20.d, z21.d, z22.d, z23.d }, ((i % 5) == 0)
for (int i = 0; i < vl_d; i++) {
if ((i % 5) == 0) {
int64_t offset = 16 * vl;
uint64_t lane0 = -7 + (8 * i);
uint64_t lane1 = -8 + (8 * i);
uint64_t lane2 = -9 + (8 * i);
uint64_t lane3 = -10 + (8 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
}
}
ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
// Check that we loaded back the expected values.
// st4b/ld4b
ASSERT_EQUAL_SVE(z3, z19);
ASSERT_EQUAL_SVE(z4, z20);
ASSERT_EQUAL_SVE(z5, z21);
ASSERT_EQUAL_SVE(z6, z22);
// st4h/ld4h
ASSERT_EQUAL_SVE(z7, z23);
ASSERT_EQUAL_SVE(z8, z24);
ASSERT_EQUAL_SVE(z9, z25);
ASSERT_EQUAL_SVE(z10, z26);
// st4w/ld4w
ASSERT_EQUAL_SVE(z11, z27);
ASSERT_EQUAL_SVE(z12, z28);
ASSERT_EQUAL_SVE(z13, z29);
ASSERT_EQUAL_SVE(z14, z30);
// st4d/ld4d
ASSERT_EQUAL_SVE(z15, z31);
ASSERT_EQUAL_SVE(z16, z0);
ASSERT_EQUAL_SVE(z17, z1);
ASSERT_EQUAL_SVE(z18, z2);
delete[] expected;
}
delete[] data;
}
TEST_SVE(sve_ld4_st4_scalar_plus_scalar) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int vl = config->sve_vl_in_bytes();
// Allocate plenty of space to enable indexing in both directions.
int data_size = vl * 128;
uint8_t* data = new uint8_t[data_size];
memset(data, 0, data_size);
// Set the base half-way through the buffer so we can use negative indices.
__ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
// We can test ld4 by comparing the values loaded with the values stored.
// There are two complications:
// - Loads have zeroing predication, so we have to clear the inactive
// elements on our reference.
// - We want to test both loads and stores that span { z31, z0 }, so we have
// to move some values around.
//
// Registers z3-z18 will hold as-stored values (with inactive elements
// cleared). Registers z19-z31 and z0-z2 will hold the values that were
// loaded.
__ Index(z19.VnB(), -4, 11);
__ Index(z20.VnB(), -5, 11);
__ Index(z21.VnB(), -6, 11);
__ Index(z22.VnB(), -7, 11);
__ Ptrue(p7.VnB(), SVE_MUL4);
__ Rdvl(x1, -1); // Make offsets VL-dependent so we can avoid overlap.
__ St4b(z19.VnB(),
z20.VnB(),
z21.VnB(),
z22.VnB(),
p7,
SVEMemOperand(x0, x1, LSL, 0));
// Save the stored values for ld4 tests.
__ Dup(z3.VnB(), 0);
__ Dup(z4.VnB(), 0);
__ Dup(z5.VnB(), 0);
__ Dup(z6.VnB(), 0);
__ Mov(z3.VnB(), p7.Merging(), z19.VnB());
__ Mov(z4.VnB(), p7.Merging(), z20.VnB());
__ Mov(z5.VnB(), p7.Merging(), z21.VnB());
__ Mov(z6.VnB(), p7.Merging(), z22.VnB());
__ Index(z23.VnH(), 6, -2);
__ Index(z24.VnH(), 7, -2);
__ Index(z25.VnH(), 8, -2);
__ Index(z26.VnH(), 9, -2);
__ Ptrue(p6.VnH(), SVE_VL16);
__ Rdvl(x2, 7); // (7 * vl) << 1 = 14 * vl
__ St4h(z23.VnH(),
z24.VnH(),
z25.VnH(),
z26.VnH(),
p6,
SVEMemOperand(x0, x2, LSL, 1));
// Save the stored values for ld4 tests.
__ Dup(z7.VnH(), 0);
__ Dup(z8.VnH(), 0);
__ Dup(z9.VnH(), 0);
__ Dup(z10.VnH(), 0);
__ Mov(z7.VnH(), p6.Merging(), z23.VnH());
__ Mov(z8.VnH(), p6.Merging(), z24.VnH());
__ Mov(z9.VnH(), p6.Merging(), z25.VnH());
__ Mov(z10.VnH(), p6.Merging(), z26.VnH());
// Wrap around from z31 to z0.
__ Index(z29.VnS(), -6, 7);
__ Index(z30.VnS(), -7, 7);
__ Index(z31.VnS(), -8, 7);
__ Index(z0.VnS(), -9, 7);
// Sparse predication, including some irrelevant bits (0xe). To make the
// results easy to check, activate each lane <n> where n is a multiple of 5.
Initialise(&masm,
p5,
0xeee1000010000100,
0x001eeee100001000,
0x0100001eeee10000,
0x10000100001eeee1);
__ Rdvl(x3, -5); // -(5 * vl) << 2 = -20 * vl
__ St4w(z29.VnS(),
z30.VnS(),
z31.VnS(),
z0.VnS(),
p5,
SVEMemOperand(x0, x3, LSL, 2));
// Save the stored values for ld4 tests.
__ Dup(z11.VnS(), 0);
__ Dup(z12.VnS(), 0);
__ Dup(z13.VnS(), 0);
__ Dup(z14.VnS(), 0);
__ Mov(z11.VnS(), p5.Merging(), z29.VnS());
__ Mov(z12.VnS(), p5.Merging(), z30.VnS());
__ Mov(z13.VnS(), p5.Merging(), z31.VnS());
__ Mov(z14.VnS(), p5.Merging(), z0.VnS());
__ Index(z31.VnD(), 32, -11);
__ Index(z0.VnD(), 33, -11);
__ Index(z1.VnD(), 34, -11);
__ Index(z2.VnD(), 35, -11);
__ Ptrue(p4.VnD(), SVE_MUL3);
__ Rdvl(x4, -1); // -(1 * vl) << 3 = -8 * vl
__ St4d(z31.VnD(),
z0.VnD(),
z1.VnD(),
z2.VnD(),
p4,
SVEMemOperand(x0, x4, LSL, 3));
// Save the stored values for ld4 tests.
__ Dup(z15.VnD(), 0);
__ Dup(z16.VnD(), 0);
__ Dup(z17.VnD(), 0);
__ Dup(z18.VnD(), 0);
__ Mov(z15.VnD(), p4.Merging(), z31.VnD());
__ Mov(z16.VnD(), p4.Merging(), z0.VnD());
__ Mov(z17.VnD(), p4.Merging(), z1.VnD());
__ Mov(z18.VnD(), p4.Merging(), z2.VnD());
// Corresponding loads.
// Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
__ Ld4b(z31.VnB(),
z0.VnB(),
z1.VnB(),
z2.VnB(),
p7.Zeroing(),
SVEMemOperand(x0, x1, LSL, 0));
__ Mov(z19, z31);
__ Mov(z20, z0);
__ Mov(z21, z1);
__ Mov(z22, z2);
__ Ld4h(z23.VnH(),
z24.VnH(),
z25.VnH(),
z26.VnH(),
p6.Zeroing(),
SVEMemOperand(x0, x2, LSL, 1));
__ Ld4w(z27.VnS(),
z28.VnS(),
z29.VnS(),
z30.VnS(),
p5.Zeroing(),
SVEMemOperand(x0, x3, LSL, 2));
// Wrap around from z31 to z0.
__ Ld4d(z31.VnD(),
z0.VnD(),
z1.VnD(),
z2.VnD(),
p4.Zeroing(),
SVEMemOperand(x0, x4, LSL, 3));
END();
if (CAN_RUN()) {
RUN();
uint8_t* expected = new uint8_t[data_size];
memset(expected, 0, data_size);
uint8_t* middle = &expected[data_size / 2];
int vl_b = vl / kBRegSizeInBytes;
int vl_h = vl / kHRegSizeInBytes;
int vl_s = vl / kSRegSizeInBytes;
int vl_d = vl / kDRegSizeInBytes;
int reg_count = 4;
// st4b { z19.b, z20.b, z21.b, z22.b }, SVE_MUL4
int vl_b_mul4 = vl_b - (vl_b % 4);
for (int i = 0; i < vl_b_mul4; i++) {
int64_t offset = -(1 << kBRegSizeInBytesLog2) * vl;
uint8_t lane0 = -4 + (11 * i);
uint8_t lane1 = -5 + (11 * i);
uint8_t lane2 = -6 + (11 * i);
uint8_t lane3 = -7 + (11 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
}
// st4h { z23.h, z24.h, z25.h, z26.h }, SVE_VL16
if (vl_h >= 16) {
for (int i = 0; i < 16; i++) {
int64_t offset = (7 << kHRegSizeInBytesLog2) * vl;
uint16_t lane0 = 6 - (2 * i);
uint16_t lane1 = 7 - (2 * i);
uint16_t lane2 = 8 - (2 * i);
uint16_t lane3 = 9 - (2 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
}
}
// st4w { z29.s, z30.s, z31.s, z0.s }, ((i % 5) == 0)
for (int i = 0; i < vl_s; i++) {
if ((i % 5) == 0) {
int64_t offset = -(5 << kSRegSizeInBytesLog2) * vl;
uint32_t lane0 = -6 + (7 * i);
uint32_t lane1 = -7 + (7 * i);
uint32_t lane2 = -8 + (7 * i);
uint32_t lane3 = -9 + (7 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
}
}
// st4d { z31.d, z0.d, z1.d, z2.d }, SVE_MUL3
int vl_d_mul3 = vl_d - (vl_d % 3);
for (int i = 0; i < vl_d_mul3; i++) {
int64_t offset = -(1 << kDRegSizeInBytesLog2) * vl;
uint64_t lane0 = 32 - (11 * i);
uint64_t lane1 = 33 - (11 * i);
uint64_t lane2 = 34 - (11 * i);
uint64_t lane3 = 35 - (11 * i);
MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
}
ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
// Check that we loaded back the expected values.
// st4b/ld4b
ASSERT_EQUAL_SVE(z3, z19);
ASSERT_EQUAL_SVE(z4, z20);
ASSERT_EQUAL_SVE(z5, z21);
ASSERT_EQUAL_SVE(z6, z22);
// st4h/ld4h
ASSERT_EQUAL_SVE(z7, z23);
ASSERT_EQUAL_SVE(z8, z24);
ASSERT_EQUAL_SVE(z9, z25);
ASSERT_EQUAL_SVE(z10, z26);
// st4w/ld4w
ASSERT_EQUAL_SVE(z11, z27);
ASSERT_EQUAL_SVE(z12, z28);
ASSERT_EQUAL_SVE(z13, z29);
ASSERT_EQUAL_SVE(z14, z30);
// st4d/ld4d
ASSERT_EQUAL_SVE(z15, z31);
ASSERT_EQUAL_SVE(z16, z0);
ASSERT_EQUAL_SVE(z17, z1);
ASSERT_EQUAL_SVE(z18, z2);
delete[] expected;
}
delete[] data;
}
TEST_SVE(sve_ld234_st234_scalar_plus_scalar_sp) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// Check that the simulator correctly interprets rn == 31 as sp.
// The indexing logic is the same regardless, so we just check one load and
// store of each type.
// There are no pre- or post-indexing modes, so reserve space first.
__ ClaimVL(2 + 3 + 4);
__ Index(z0.VnB(), 42, 2);
__ Index(z1.VnB(), 43, 2);
__ Ptrue(p0.VnB(), SVE_VL7);
__ Rdvl(x0, 0);
__ St2b(z0.VnB(), z1.VnB(), p0, SVEMemOperand(sp, x0));
__ Index(z4.VnH(), 42, 3);
__ Index(z5.VnH(), 43, 3);
__ Index(z6.VnH(), 44, 3);
__ Ptrue(p1.VnH(), SVE_POW2);
__ Rdvl(x1, 2);
__ Lsr(x1, x1, 1);
__ St3h(z4.VnH(), z5.VnH(), z6.VnH(), p1, SVEMemOperand(sp, x1, LSL, 1));
__ Index(z8.VnS(), 42, 4);
__ Index(z9.VnS(), 43, 4);
__ Index(z10.VnS(), 44, 4);
__ Index(z11.VnS(), 45, 4);
__ Ptrue(p2.VnS());
__ Rdvl(x2, 2 + 3);
__ Lsr(x2, x2, 2);
__ St4w(z8.VnS(),
z9.VnS(),
z10.VnS(),
z11.VnS(),
p2,
SVEMemOperand(sp, x2, LSL, 2));
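// With x0 = 0, x1 = (2 * VL) / 2 and x2 = (5 * VL) / 4, the three stores
// land back-to-back: st2b at sp, st3h at sp + (2 * VL) and st4w at
// sp + (5 * VL), filling the nine VLs claimed above.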
// Corresponding loads.
// We have to explicitly zero inactive lanes in the reference values because
// loads have zeroing predication.
__ Dup(z12.VnB(), 0);
__ Dup(z13.VnB(), 0);
__ Mov(z12.VnB(), p0.Merging(), z0.VnB());
__ Mov(z13.VnB(), p0.Merging(), z1.VnB());
__ Ld2b(z0.VnB(), z1.VnB(), p0.Zeroing(), SVEMemOperand(sp, x0));
__ Dup(z16.VnH(), 0);
__ Dup(z17.VnH(), 0);
__ Dup(z18.VnH(), 0);
__ Mov(z16.VnH(), p1.Merging(), z4.VnH());
__ Mov(z17.VnH(), p1.Merging(), z5.VnH());
__ Mov(z18.VnH(), p1.Merging(), z6.VnH());
__ Ld3h(z4.VnH(),
z5.VnH(),
z6.VnH(),
p1.Zeroing(),
SVEMemOperand(sp, x1, LSL, 1));
__ Dup(z20.VnS(), 0);
__ Dup(z21.VnS(), 0);
__ Dup(z22.VnS(), 0);
__ Dup(z23.VnS(), 0);
__ Mov(z20.VnS(), p2.Merging(), z8.VnS());
__ Mov(z21.VnS(), p2.Merging(), z9.VnS());
__ Mov(z22.VnS(), p2.Merging(), z10.VnS());
__ Mov(z23.VnS(), p2.Merging(), z11.VnS());
__ Ld4w(z8.VnS(),
z9.VnS(),
z10.VnS(),
z11.VnS(),
p2.Zeroing(),
SVEMemOperand(sp, x2, LSL, 2));
__ DropVL(2 + 3 + 4);
END();
if (CAN_RUN()) {
RUN();
// The most likely failure mode is that the simulator reads sp as xzr and
// crashes on execution. We already test the address calculations separately
// and sp doesn't change this, so just test that we load the values we
// stored.
// st2b/ld2b
ASSERT_EQUAL_SVE(z0, z12);
ASSERT_EQUAL_SVE(z1, z13);
// st3h/ld3h
ASSERT_EQUAL_SVE(z4, z16);
ASSERT_EQUAL_SVE(z5, z17);
ASSERT_EQUAL_SVE(z6, z18);
// st4w/ld4w
ASSERT_EQUAL_SVE(z8, z20);
ASSERT_EQUAL_SVE(z9, z21);
ASSERT_EQUAL_SVE(z10, z22);
ASSERT_EQUAL_SVE(z11, z23);
}
}
TEST_SVE(sve_ld234_st234_scalar_plus_imm_sp) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// Check that the simulator correctly interprets rn == 31 as sp.
// The indexing logic is the same regardless, so we just check one load and
// store of each type.
// There are no pre- or post-indexing modes, so reserve space first.
// Note that the stores fill in an order that allows each immediate to be a
// multiple of the number of registers.
__ ClaimVL(4 + 2 + 3);
__ Index(z0.VnB(), 42, 2);
__ Index(z1.VnB(), 43, 2);
__ Ptrue(p0.VnB(), SVE_POW2);
__ St2b(z0.VnB(), z1.VnB(), p0, SVEMemOperand(sp, 4, SVE_MUL_VL));
__ Index(z4.VnH(), 42, 3);
__ Index(z5.VnH(), 43, 3);
__ Index(z6.VnH(), 44, 3);
__ Ptrue(p1.VnH(), SVE_VL7);
__ St3h(z4.VnH(), z5.VnH(), z6.VnH(), p1, SVEMemOperand(sp, 6, SVE_MUL_VL));
__ Index(z8.VnS(), 42, 4);
__ Index(z9.VnS(), 43, 4);
__ Index(z10.VnS(), 44, 4);
__ Index(z11.VnS(), 45, 4);
__ Ptrue(p2.VnS());
__ St4w(z8.VnS(), z9.VnS(), z10.VnS(), z11.VnS(), p2, SVEMemOperand(sp));
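// The regions are therefore laid out as: st4w at sp, st2b at sp + (4 * VL)
// and st3h at sp + (6 * VL), filling the nine VLs claimed above.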
// Corresponding loads.
// We have to explicitly zero inactive lanes in the reference values because
// loads have zeroing predication.
__ Dup(z12.VnB(), 0);
__ Dup(z13.VnB(), 0);
__ Mov(z12.VnB(), p0.Merging(), z0.VnB());
__ Mov(z13.VnB(), p0.Merging(), z1.VnB());
__ Ld2b(z0.VnB(), z1.VnB(), p0.Zeroing(), SVEMemOperand(sp, 4, SVE_MUL_VL));
__ Dup(z16.VnH(), 0);
__ Dup(z17.VnH(), 0);
__ Dup(z18.VnH(), 0);
__ Mov(z16.VnH(), p1.Merging(), z4.VnH());
__ Mov(z17.VnH(), p1.Merging(), z5.VnH());
__ Mov(z18.VnH(), p1.Merging(), z6.VnH());
__ Ld3h(z4.VnH(),
z5.VnH(),
z6.VnH(),
p1.Zeroing(),
SVEMemOperand(sp, 6, SVE_MUL_VL));
__ Dup(z20.VnS(), 0);
__ Dup(z21.VnS(), 0);
__ Dup(z22.VnS(), 0);
__ Dup(z23.VnS(), 0);
__ Mov(z20.VnS(), p2.Merging(), z8.VnS());
__ Mov(z21.VnS(), p2.Merging(), z9.VnS());
__ Mov(z22.VnS(), p2.Merging(), z10.VnS());
__ Mov(z23.VnS(), p2.Merging(), z11.VnS());
__ Ld4w(z8.VnS(),
z9.VnS(),
z10.VnS(),
z11.VnS(),
p2.Zeroing(),
SVEMemOperand(sp));
__ DropVL(4 + 2 + 3);
END();
if (CAN_RUN()) {
RUN();
// The most likely failure mode is that the simulator reads sp as xzr and
// crashes on execution. We already test the address calculations separately
// and sp doesn't change this, so just test that we load the values we
// stored.
// st2b/ld2b
ASSERT_EQUAL_SVE(z0, z12);
ASSERT_EQUAL_SVE(z1, z13);
// st3h/ld3h
ASSERT_EQUAL_SVE(z4, z16);
ASSERT_EQUAL_SVE(z5, z17);
ASSERT_EQUAL_SVE(z6, z18);
// st4w/ld4w
ASSERT_EQUAL_SVE(z8, z20);
ASSERT_EQUAL_SVE(z9, z21);
ASSERT_EQUAL_SVE(z10, z22);
ASSERT_EQUAL_SVE(z11, z23);
}
}
// Fill the input buffer with arbitrary data. Also generate random offsets
// from the base address of the buffer and, if the corresponding arguments
// are provided, the matching absolute addresses.
static void BufferFillingHelper(uint64_t data_ptr,
size_t buffer_size,
unsigned lane_size_in_bytes,
int lane_count,
uint64_t* offsets,
uint64_t* addresses = nullptr,
uint64_t* max_address = nullptr) {
// Use a fixed seed for nrand48() so that test runs are reproducible.
unsigned short seed[3] = {1, 2, 3}; // NOLINT(google-runtime-int)
// Fill a buffer with arbitrary data.
for (size_t i = 0; i < buffer_size; i++) {
uint8_t byte = nrand48(seed) & 0xff;
memcpy(reinterpret_cast<void*>(data_ptr + i), &byte, 1);
}
if (max_address != nullptr) {
*max_address = 0;
}
// Vectors of random addresses and offsets into the buffer.
for (int i = 0; i < lane_count; i++) {
uint64_t rnd = nrand48(seed);
// Limit the range to the set of completely-accessible elements in memory.
offsets[i] = rnd % (buffer_size - lane_size_in_bytes);
if ((addresses != nullptr) && (max_address != nullptr)) {
addresses[i] = data_ptr + offsets[i];
*max_address = std::max(*max_address, addresses[i]);
}
}
}
static void ScalarLoadHelper(MacroAssembler* masm,
Register dst,
Register addr,
int msize_in_bits,
bool is_signed) {
if (is_signed) {
switch (msize_in_bits) {
case kBRegSize:
masm->Ldrsb(dst, MemOperand(addr));
break;
case kHRegSize:
masm->Ldrsh(dst, MemOperand(addr));
break;
case kWRegSize:
masm->Ldrsw(dst, MemOperand(addr));
break;
default:
VIXL_UNIMPLEMENTED();
break;
}
} else {
switch (msize_in_bits) {
case kBRegSize:
masm->Ldrb(dst, MemOperand(addr));
break;
case kHRegSize:
masm->Ldrh(dst, MemOperand(addr));
break;
case kWRegSize:
masm->Ldr(dst.W(), MemOperand(addr));
break;
case kXRegSize:
masm->Ldr(dst, MemOperand(addr));
break;
default:
VIXL_UNIMPLEMENTED();
break;
}
}
}
// Generate a reference result using scalar loads.
// For now, this helper doesn't save and restore the caller's registers.
// It clobbers z30, x28, x29 and p7.
template <size_t N>
static void ScalarLoadHelper(MacroAssembler* masm,
int vl,
const uint64_t (&addresses)[N],
const ZRegister& zt_ref,
const PRegisterZ& pg,
unsigned esize_in_bits,
unsigned msize_in_bits,
bool is_signed) {
unsigned esize_in_bytes = esize_in_bits / kBitsPerByte;
ZRegister lane_numbers = z30.WithLaneSize(esize_in_bits);
masm->Index(lane_numbers, 0, 1);
masm->Dup(zt_ref, 0);
for (unsigned i = 0; i < (vl / esize_in_bytes); i++) {
masm->Mov(x29, addresses[N - i - 1]);
Register rt(28, std::min(std::max(esize_in_bits, kSRegSize), kDRegSize));
ScalarLoadHelper(masm, rt, x29, msize_in_bits, is_signed);
// Emulate predication.
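// Cmpeq activates p7 only for lane i (and only if pg is active there), and
// Cpy then merges the loaded scalar into just that lane of the reference.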
masm->Cmpeq(p7.WithLaneSize(esize_in_bits), pg, lane_numbers, i);
masm->Cpy(zt_ref, p7.Merging(), rt);
}
}
typedef void (MacroAssembler::*Ld1Macro)(const ZRegister& zt,
const PRegisterZ& pg,
const SVEMemOperand& addr);
template <typename T>
static void Ldff1Helper(Test* config,
uintptr_t data,
unsigned msize_in_bits,
unsigned esize_in_bits,
CPURegister::RegisterType base_type,
Ld1Macro ldff1,
Ld1Macro ld1,
T mod,
bool scale = false) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int vl = config->sve_vl_in_bytes();
size_t page_size = sysconf(_SC_PAGE_SIZE);
VIXL_ASSERT(page_size > static_cast<size_t>(vl));
unsigned esize_in_bytes = esize_in_bits / kBitsPerByte;
unsigned msize_in_bytes = msize_in_bits / kBitsPerByte;
unsigned msize_in_bytes_log2 = std::log2(msize_in_bytes);
VIXL_ASSERT(msize_in_bits <= esize_in_bits);
PRegister all = p7;
__ Ptrue(all.VnB());
size_t offset_modifier = 0;
// The highest address at which a load stopped. Every FF load should fault at
// `data + page_size`, so this value should not exceed that address. However,
// the architecture allows fault-tolerant loads to fault arbitrarily, so the
// real value may be lower.
//
// This is used to check that the `mprotect` applied by the caller really does
// make the second page inaccessible, and that the resulting FFR from each
// load reflects that.
Register limit = x22;
__ Mov(limit, 0);
// If the FFR grows unexpectedly, we increment this register by the
// difference. FFR should never grow, except when explicitly set.
Register ffr_grow_count = x23;
__ Mov(ffr_grow_count, 0);
// Set the offset so that the load is guaranteed to start in the
// accessible page, but end in the inaccessible one.
VIXL_ASSERT((page_size % msize_in_bytes) == 0);
VIXL_ASSERT((vl % msize_in_bytes) == 0);
size_t elements_per_page = page_size / msize_in_bytes;
size_t elements_per_access = vl / esize_in_bytes;
size_t min_offset = (elements_per_page - elements_per_access) + 1;
size_t max_offset = elements_per_page - 1;
size_t offset =
min_offset + (offset_modifier % (max_offset - min_offset + 1));
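// For example, with a 4096-byte page, a 256-bit VL and byte elements
// (msize == esize), elements_per_page is 4096 and elements_per_access is 32,
// so the computed offset falls in [4065, 4095] and the access is certain to
// straddle the page boundary.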
offset_modifier++;
__ Setffr();
__ Mov(x20, data);
__ Mov(x21, offset);
if (base_type == CPURegister::kRegister) {
// Scalar-plus-scalar mode.
VIXL_ASSERT((std::is_same<T, vixl::aarch64::Shift>::value));
VIXL_ASSERT((static_cast<int>(mod) == LSL) ||
(static_cast<int>(mod) == NO_SHIFT));
(masm.*ldff1)(z0.WithLaneSize(esize_in_bits),
all.Zeroing(),
SVEMemOperand(x20, x21, mod, msize_in_bytes_log2));
} else {
VIXL_ASSERT(base_type == CPURegister::kZRegister);
int offs_size;
bool offs_is_unsigned;
if (std::is_same<T, vixl::aarch64::Extend>::value) {
// Scalar-plus-vector mode with a 32-bit packed or unpacked, unscaled or
// scaled offset.
VIXL_ASSERT((static_cast<int>(mod) == SXTW) ||
(static_cast<int>(mod) == UXTW));
if (scale == true) {
// First-fault gather byte loads don't support a scaled offset.
VIXL_ASSERT(msize_in_bits != kBRegSize);
}
offs_is_unsigned = (static_cast<int>(mod) == UXTW) ? true : false;
offs_size = kSRegSize;
} else {
// Scalar-plus-vector mode with 64-bit unscaled or scaled offset.
VIXL_ASSERT((std::is_same<T, vixl::aarch64::Shift>::value));
VIXL_ASSERT((static_cast<int>(mod) == LSL) ||
(static_cast<int>(mod) == NO_SHIFT));
offs_is_unsigned = false;
offs_size = kDRegSize;
}
// Generate the access pattern "base address + (index << shift)". For
// unscaled-offset operations, step the indices by `msize_in_bytes` between
// successive accesses; otherwise, step them by one and let the shift value
// provide the scaling.
int shift = (scale == true) ? msize_in_bytes_log2 : 0;
int index_offset = msize_in_bytes >> shift;
VIXL_ASSERT(index_offset > 0);
uint64_t index = 0;
uint64_t base_address = 0;
if (offs_is_unsigned == true) {
// Base address.
base_address = data;
// Maximum positive index; it addresses the page boundary at
// `data + page_size`.
index = page_size >> shift;
} else {
// Base address.
base_address = data + (2 * page_size);
// Negative index, encoded as an unsigned two's-complement value, that
// addresses the same page boundary from above.
uint64_t uint_e_max =
(esize_in_bits == kDRegSize) ? UINT64_MAX : UINT32_MAX;
index = uint_e_max - (page_size >> shift) + 1;
}
__ Mov(x19, base_address);
if ((offs_size == kSRegSize) && (esize_in_bits == kDRegSize)) {
// In this case, the index values are sign- or zero-extended from 32 to
// 64 bits. Assign an arbitrary value to the top 32 bits to check that
// only the low 32 bits are used as the index.
index |= 0x1234567800000000;
}
index -= index_offset * (elements_per_access - 1);
__ Index(z17.WithLaneSize(esize_in_bits), index, index_offset);
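// z17 now holds indices whose addresses ascend by `msize_in_bytes` per lane,
// with the highest lane addressing exactly `data + page_size` (the first
// byte of the protected page), so that lane is guaranteed to fault.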
// Scalar plus vector mode.
(masm.*
ldff1)(z0.WithLaneSize(esize_in_bits),
all.Zeroing(),
SVEMemOperand(x19, z17.WithLaneSize(esize_in_bits), mod, shift));
}
__ Rdffrs(p0.VnB(), all.Zeroing());
// Execute another Ldff1 with no offset, so that every element can be read.
// It should respect the FFR, and load no more elements than the first load
// did.
(masm.*
ldff1)(z16.WithLaneSize(esize_in_bits), all.Zeroing(), SVEMemOperand(x20));
__ Rdffrs(p1.VnB(), all.Zeroing());
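// Count the lanes active after the second load, then subtract (with
// unsigned saturation) the count from the first load; any remainder means
// that the FFR grew.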
__ Cntp(x0, all, p1.VnB());
__ Uqdecp(x0, p0.VnB());
__ Add(ffr_grow_count, ffr_grow_count, x0);
// Use the FFR to predicate the normal load. If it wasn't properly set,
// the normal load will abort.
(masm.*ld1)(z16.WithLaneSize(esize_in_bits),
p0.Zeroing(),
SVEMemOperand(x20, x21, LSL, msize_in_bytes_log2));
// Work out the address after the one that was just accessed.
__ Incp(x21, p0.WithLaneSize(esize_in_bits));
__ Add(x0, x20, Operand(x21, LSL, msize_in_bytes_log2));
__ Cmp(limit, x0);
__ Csel(limit, limit, x0, hs);
// Clear lanes inactive in FFR. These have an undefined result.
__ Not(p0.VnB(), all.Zeroing(), p0.VnB());
__ Mov(z0.WithLaneSize(esize_in_bits), p0.Merging(), 0);
END();
if (CAN_RUN()) {
RUN();
uintptr_t expected_limit = data + page_size;
uintptr_t measured_limit = core.xreg(limit.GetCode());
VIXL_CHECK(measured_limit <= expected_limit);
if (measured_limit < expected_limit) {
// We can't fail the test for this case, but a warning is helpful for
// manually-run tests.
printf(
"WARNING: All fault-tolerant loads detected faults before the\n"
"expected limit. This is architecturally possible, but improbable,\n"
"and could be a symptom of another problem.\n");
}
ASSERT_EQUAL_64(0, ffr_grow_count);
ASSERT_EQUAL_SVE(z0.WithLaneSize(esize_in_bits),
z16.WithLaneSize(esize_in_bits));
}
}
TEST_SVE(sve_ldff1_scalar_plus_scalar) {
size_t page_size = sysconf(_SC_PAGE_SIZE);
VIXL_ASSERT(page_size > static_cast<size_t>(config->sve_vl_in_bytes()));
// Allocate two pages, then mprotect the second one to make it inaccessible.
uintptr_t data = reinterpret_cast<uintptr_t>(mmap(NULL,
page_size * 2,
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS,
-1,
0));
mprotect(reinterpret_cast<void*>(data + page_size), page_size, PROT_NONE);
// Fill the accessible page with arbitrary data.
for (size_t i = 0; i < page_size; i++) {
// Reverse bits so we get a mixture of positive and negative values.
uint8_t byte = ReverseBits(static_cast<uint8_t>(i));
memcpy(reinterpret_cast<void*>(data + i), &byte, 1);
}
auto ldff1_unscaled_offset_helper = std::bind(&Ldff1Helper<Shift>,
config,
data,
std::placeholders::_1,
std::placeholders::_2,
CPURegister::kRegister,
std::placeholders::_3,
std::placeholders::_4,
NO_SHIFT,
false);
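// The placeholder arguments are, in order: msize_in_bits, esize_in_bits,
// the first-fault load and the corresponding normal load.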
Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
Ld1Macro ld1b = &MacroAssembler::Ld1b;
ldff1_unscaled_offset_helper(kBRegSize, kBRegSize, ldff1b, ld1b);
ldff1_unscaled_offset_helper(kBRegSize, kHRegSize, ldff1b, ld1b);
ldff1_unscaled_offset_helper(kBRegSize, kSRegSize, ldff1b, ld1b);
ldff1_unscaled_offset_helper(kBRegSize, kDRegSize, ldff1b, ld1b);
Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
ldff1_unscaled_offset_helper(kBRegSize, kHRegSize, ldff1sb, ld1sb);
ldff1_unscaled_offset_helper(kBRegSize, kSRegSize, ldff1sb, ld1sb);
ldff1_unscaled_offset_helper(kBRegSize, kDRegSize, ldff1sb, ld1sb);
auto ldff1_scaled_offset_helper = std::bind(&Ldff1Helper<Shift>,
config,
data,
std::placeholders::_1,
std::placeholders::_2,
CPURegister::kRegister,
std::placeholders::_3,
std::placeholders::_4,
LSL,
true);
Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
Ld1Macro ld1h = &MacroAssembler::Ld1h;
ldff1_scaled_offset_helper(kHRegSize, kHRegSize, ldff1h, ld1h);
ldff1_scaled_offset_helper(kHRegSize, kSRegSize, ldff1h, ld1h);
ldff1_scaled_offset_helper(kHRegSize, kDRegSize, ldff1h, ld1h);
Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
Ld1Macro ld1w = &MacroAssembler::Ld1w;
ldff1_scaled_offset_helper(kSRegSize, kSRegSize, ldff1w, ld1w);
ldff1_scaled_offset_helper(kSRegSize, kDRegSize, ldff1w, ld1w);
Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
Ld1Macro ld1d = &MacroAssembler::Ld1d;
ldff1_scaled_offset_helper(kDRegSize, kDRegSize, ldff1d, ld1d);
Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
ldff1_scaled_offset_helper(kHRegSize, kSRegSize, ldff1sh, ld1sh);
ldff1_scaled_offset_helper(kHRegSize, kDRegSize, ldff1sh, ld1sh);
Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
ldff1_scaled_offset_helper(kSRegSize, kDRegSize, ldff1sw, ld1sw);
munmap(reinterpret_cast<void*>(data), page_size * 2);
}
static void sve_ldff1_scalar_plus_vector_32_scaled_offset(Test* config,
uintptr_t data) {
auto ldff1_32_scaled_offset_helper = std::bind(&Ldff1Helper<Extend>,
config,
data,
std::placeholders::_1,
kSRegSize,
CPURegister::kZRegister,
std::placeholders::_2,
std::placeholders::_3,
std::placeholders::_4,
true);
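// Here the placeholders supply (msize_in_bits, ldff1, ld1, extend); the
// element size is fixed at kSRegSize.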
Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
Ld1Macro ld1h = &MacroAssembler::Ld1h;
ldff1_32_scaled_offset_helper(kHRegSize, ldff1h, ld1h, UXTW);
ldff1_32_scaled_offset_helper(kHRegSize, ldff1h, ld1h, SXTW);
Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
Ld1Macro ld1w = &MacroAssembler::Ld1w;
ldff1_32_scaled_offset_helper(kSRegSize, ldff1w, ld1w, UXTW);
ldff1_32_scaled_offset_helper(kSRegSize, ldff1w, ld1w, SXTW);
Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
ldff1_32_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh, UXTW);
ldff1_32_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh, SXTW);
}
static void sve_ldff1_scalar_plus_vector_32_unscaled_offset(Test* config,
uintptr_t data) {
auto ldff1_32_unscaled_offset_helper = std::bind(&Ldff1Helper<Extend>,
config,
data,
std::placeholders::_1,
kSRegSize,
CPURegister::kZRegister,
std::placeholders::_2,
std::placeholders::_3,
std::placeholders::_4,
false);
Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
Ld1Macro ld1b = &MacroAssembler::Ld1b;
ldff1_32_unscaled_offset_helper(kBRegSize, ldff1b, ld1b, UXTW);
ldff1_32_unscaled_offset_helper(kBRegSize, ldff1b, ld1b, SXTW);
Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
Ld1Macro ld1h = &MacroAssembler::Ld1h;
ldff1_32_unscaled_offset_helper(kHRegSize, ldff1h, ld1h, UXTW);
ldff1_32_unscaled_offset_helper(kHRegSize, ldff1h, ld1h, SXTW);
Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
Ld1Macro ld1w = &MacroAssembler::Ld1w;
ldff1_32_unscaled_offset_helper(kSRegSize, ldff1w, ld1w, UXTW);
ldff1_32_unscaled_offset_helper(kSRegSize, ldff1w, ld1w, SXTW);
Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
ldff1_32_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb, UXTW);
ldff1_32_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb, SXTW);
Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
ldff1_32_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh, UXTW);
ldff1_32_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh, SXTW);
}
static void sve_ldff1_scalar_plus_vector_32_unpacked_scaled_offset(
Test* config, uintptr_t data) {
auto ldff1_32_unpacked_scaled_offset_helper =
std::bind(&Ldff1Helper<Extend>,
config,
data,
std::placeholders::_1,
kDRegSize,
CPURegister::kZRegister,
std::placeholders::_2,
std::placeholders::_3,
std::placeholders::_4,
true);
Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
Ld1Macro ld1h = &MacroAssembler::Ld1h;
ldff1_32_unpacked_scaled_offset_helper(kHRegSize, ldff1h, ld1h, UXTW);
ldff1_32_unpacked_scaled_offset_helper(kHRegSize, ldff1h, ld1h, SXTW);
Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
Ld1Macro ld1w = &MacroAssembler::Ld1w;
ldff1_32_unpacked_scaled_offset_helper(kSRegSize, ldff1w, ld1w, UXTW);
ldff1_32_unpacked_scaled_offset_helper(kSRegSize, ldff1w, ld1w, SXTW);
Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
Ld1Macro ld1d = &MacroAssembler::Ld1d;
ldff1_32_unpacked_scaled_offset_helper(kDRegSize, ldff1d, ld1d, UXTW);
ldff1_32_unpacked_scaled_offset_helper(kDRegSize, ldff1d, ld1d, SXTW);
Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
ldff1_32_unpacked_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh, UXTW);
ldff1_32_unpacked_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh, SXTW);
Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
ldff1_32_unpacked_scaled_offset_helper(kSRegSize, ldff1sw, ld1sw, UXTW);
ldff1_32_unpacked_scaled_offset_helper(kSRegSize, ldff1sw, ld1sw, SXTW);
}
static void sve_ldff1_scalar_plus_vector_32_unpacked_unscaled_offset(
Test* config, uintptr_t data) {
auto ldff1_32_unpacked_unscaled_offset_helper =
std::bind(&Ldff1Helper<Extend>,
config,
data,
std::placeholders::_1,
kDRegSize,
CPURegister::kZRegister,
std::placeholders::_2,
std::placeholders::_3,
std::placeholders::_4,
false);
Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
Ld1Macro ld1b = &MacroAssembler::Ld1b;
ldff1_32_unpacked_unscaled_offset_helper(kBRegSize, ldff1b, ld1b, UXTW);
ldff1_32_unpacked_unscaled_offset_helper(kBRegSize, ldff1b, ld1b, SXTW);
Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
Ld1Macro ld1h = &MacroAssembler::Ld1h;
ldff1_32_unpacked_unscaled_offset_helper(kHRegSize, ldff1h, ld1h, UXTW);
ldff1_32_unpacked_unscaled_offset_helper(kHRegSize, ldff1h, ld1h, SXTW);
Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
Ld1Macro ld1w = &MacroAssembler::Ld1w;
ldff1_32_unpacked_unscaled_offset_helper(kSRegSize, ldff1w, ld1w, UXTW);
ldff1_32_unpacked_unscaled_offset_helper(kSRegSize, ldff1w, ld1w, SXTW);
Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
Ld1Macro ld1d = &MacroAssembler::Ld1d;
ldff1_32_unpacked_unscaled_offset_helper(kDRegSize, ldff1d, ld1d, UXTW);
ldff1_32_unpacked_unscaled_offset_helper(kDRegSize, ldff1d, ld1d, SXTW);
Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
ldff1_32_unpacked_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb, UXTW);
ldff1_32_unpacked_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb, SXTW);
Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
ldff1_32_unpacked_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh, UXTW);
ldff1_32_unpacked_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh, SXTW);
Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
ldff1_32_unpacked_unscaled_offset_helper(kSRegSize, ldff1sw, ld1sw, UXTW);
ldff1_32_unpacked_unscaled_offset_helper(kSRegSize, ldff1sw, ld1sw, SXTW);
}
static void sve_ldff1_scalar_plus_vector_64_scaled_offset(Test* config,
uintptr_t data) {
auto ldff1_64_scaled_offset_helper = std::bind(&Ldff1Helper<Shift>,
config,
data,
std::placeholders::_1,
kDRegSize,
CPURegister::kZRegister,
std::placeholders::_2,
std::placeholders::_3,
LSL,
true);
Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
Ld1Macro ld1h = &MacroAssembler::Ld1h;
ldff1_64_scaled_offset_helper(kHRegSize, ldff1h, ld1h);
Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
Ld1Macro ld1w = &MacroAssembler::Ld1w;
ldff1_64_scaled_offset_helper(kSRegSize, ldff1w, ld1w);
Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
Ld1Macro ld1d = &MacroAssembler::Ld1d;
ldff1_64_scaled_offset_helper(kDRegSize, ldff1d, ld1d);
Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
ldff1_64_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh);
Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
ldff1_64_scaled_offset_helper(kSRegSize, ldff1sw, ld1sw);
}
static void sve_ldff1_scalar_plus_vector_64_unscaled_offset(Test* config,
uintptr_t data) {
auto ldff1_64_unscaled_offset_helper = std::bind(&Ldff1Helper<Shift>,
config,
data,
std::placeholders::_1,
kDRegSize,
CPURegister::kZRegister,
std::placeholders::_2,
std::placeholders::_3,
NO_SHIFT,
false);
Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
Ld1Macro ld1b = &MacroAssembler::Ld1b;
ldff1_64_unscaled_offset_helper(kBRegSize, ldff1b, ld1b);
Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
Ld1Macro ld1h = &MacroAssembler::Ld1h;
ldff1_64_unscaled_offset_helper(kHRegSize, ldff1h, ld1h);
Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
Ld1Macro ld1w = &MacroAssembler::Ld1w;
ldff1_64_unscaled_offset_helper(kSRegSize, ldff1w, ld1w);
Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
Ld1Macro ld1d = &MacroAssembler::Ld1d;
ldff1_64_unscaled_offset_helper(kDRegSize, ldff1d, ld1d);
Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
ldff1_64_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb);
Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
ldff1_64_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh);
Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
ldff1_64_unscaled_offset_helper(kSRegSize, ldff1sw, ld1sw);
}
TEST_SVE(sve_ldff1_scalar_plus_vector) {
size_t page_size = sysconf(_SC_PAGE_SIZE);
VIXL_ASSERT(page_size > static_cast<size_t>(config->sve_vl_in_bytes()));
// Allocate two pages, then mprotect the second one to make it inaccessible.
uintptr_t data = reinterpret_cast<uintptr_t>(mmap(NULL,
page_size * 2,
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS,
-1,
0));
mprotect(reinterpret_cast<void*>(data + page_size), page_size, PROT_NONE);
// Fill the accessible page with arbitrary data.
for (size_t i = 0; i < page_size; i++) {
// Reverse bits so we get a mixture of positive and negative values.
uint8_t byte = ReverseBits(static_cast<uint8_t>(i));
memcpy(reinterpret_cast<void*>(data + i), &byte, 1);
}
sve_ldff1_scalar_plus_vector_32_scaled_offset(config, data);
sve_ldff1_scalar_plus_vector_32_unscaled_offset(config, data);
sve_ldff1_scalar_plus_vector_32_unpacked_scaled_offset(config, data);
sve_ldff1_scalar_plus_vector_32_unpacked_unscaled_offset(config, data);
sve_ldff1_scalar_plus_vector_64_scaled_offset(config, data);
sve_ldff1_scalar_plus_vector_64_unscaled_offset(config, data);
munmap(reinterpret_cast<void*>(data), page_size * 2);
}
TEST_SVE(sve_ldnf1) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
CPUFeatures::kNEON,
CPUFeatures::kFP);
START();
size_t page_size = sysconf(_SC_PAGE_SIZE);
VIXL_ASSERT(page_size > static_cast<size_t>(config->sve_vl_in_bytes()));
// Allocate two pages, fill the first with data, then mprotect the second one
// to make it inaccessible.
uintptr_t data = reinterpret_cast<uintptr_t>(mmap(NULL,
page_size * 2,
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS,
-1,
0));
// Fill the first (accessible) page with arbitrary data.
for (size_t i = 0; i < page_size; i++) {
// Reverse bits so we get a mixture of positive and negative values.
uint8_t byte = ReverseBits(static_cast<uint8_t>(i));
memcpy(reinterpret_cast<void*>(data + i), &byte, 1);
}
mprotect(reinterpret_cast<void*>(data + page_size), page_size, PROT_NONE);
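// `Setffr` sets every bit of the FFR; each non-faulting load below then
// clears the FFR bits of any lanes that are not successfully loaded.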
__ Setffr();
__ Ptrue(p0.VnB());
__ Dup(z10.VnB(), 0);
// Point x0 at the last eight unprotected bytes.
__ Mov(x0, data + page_size - (kQRegSizeInBytes / kBRegSizeInBytes) / 2);
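// (kQRegSizeInBytes / kBRegSizeInBytes) / 2 == 8, so x0 is eight bytes below
// the start of the protected page.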
// Perform a non-faulting load of a vector of bytes from x0. At most eight
// bytes can be loaded; the rest lie in the protected page.
__ Ldnf1b(z0.VnB(), p0.Zeroing(), SVEMemOperand(x0));
__ Rdffr(p1.VnB());
__ Setffr();
// Create references using the FFR value in p1 to zero the undefined lanes.
__ Sel(z0.VnB(), p1, z0.VnB(), z10.VnB());
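// z20 is the reference: an ordinary (faulting) load of the same area,
// predicated on p1 so that only the lanes the non-faulting load actually
// filled are compared.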
__ Ld1b(z20.VnB(), p1.Zeroing(), SVEMemOperand(x0));
// Repeat for larger elements and different addresses, giving different FFR
// results.
__ Add(x1, x0, 1);
__ Ldnf1h(z1.VnH(), p0.Zeroing(), SVEMemOperand(x1));
__ Rdffr(p1.VnB());
__ Setffr();
__ Sel(z1.VnH(), p1, z1.VnH(), z10.VnH());
__ Ld1h(z21.VnH(), p1.Zeroing(), SVEMemOperand(x1));
__ Add(x1, x0, 2);
__ Ldnf1w(z2.VnS(), p0.Zeroing(), SVEMemOperand(x1));
__ Rdffr(p1.VnB());
__ Setffr();
__ Sel(z2.VnS(), p1, z2.VnS(), z10.VnS());
__ Ld1w(z22.VnS(), p1.Zeroing(), SVEMemOperand(x1));
__ Sub(x1, x0, 1);
__ Ldnf1d(z3.VnD(), p0.Zeroing(), SVEMemOperand(x1));
__ Rdffr(p1.VnB());
__ Setffr();
__ Sel(z3.VnD(), p1, z3.VnD(), z10.VnD());
__ Ld1d(z23.VnD(), p1.Zeroing(), SVEMemOperand(x1));
// Load from the previous VL-sized area of memory. All of it should be in the
// accessible page.
__ Ldnf1b(z4.VnB(), p0.Zeroing(), SVEMemOperand(x0, -1, SVE_MUL_VL));
__ Rdffr(p1.VnB());
__ Setffr();
__ Sel(z4.VnB(), p1, z4.VnB(), z10.VnB());
__ Ld1b(z24.VnB(), p1.Zeroing(), SVEMemOperand(x0, -1, SVE_MUL_VL));
// Repeat partial load for larger element size.
__ Mov(x0, data + page_size - (kQRegSizeInBytes / kSRegSizeInBytes) / 2);
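// (kQRegSizeInBytes / kSRegSizeInBytes) / 2 == 2, so only the first two
// byte-per-lane loads fall in the accessible page.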
__ Ldnf1b(z5.VnS(), p0.Zeroing(), SVEMemOperand(x0));
__ Rdffr(p1.VnB());
__ Setffr();
__ Sel(z5.VnS(), p1, z5.VnS(), z10.VnS());
__ Ld1b(z25.VnS(), p1.Zeroing(), SVEMemOperand(x0));
// Repeat for sign extension.
__ Mov(x0, data + page_size - (kQRegSizeInBytes / kHRegSizeInBytes) / 2);
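// (kQRegSizeInBytes / kHRegSizeInBytes) / 2 == 4, so only the first four
// sign-extending byte loads fall in the accessible page.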
__ Ldnf1sb(z6.VnH(), p0.Zeroing(), SVEMemOperand(x0));
__ Rdffr(p1.VnB());
__ Setffr();
__ Sel(z6.VnH(), p1, z6.VnH(), z10.VnH());
__ Ld1sb(z26.VnH(), p1.Zeroing(), SVEMemOperand(x0));
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(z20, z0);
ASSERT_EQUAL_SVE(z21, z1);
ASSERT_EQUAL_SVE(z22, z2);
ASSERT_EQUAL_SVE(z23, z3);
ASSERT_EQUAL_SVE(z24, z4);
ASSERT_EQUAL_SVE(z25, z5);
ASSERT_EQUAL_SVE(z26, z6);
}
munmap(reinterpret_cast<void*>(data), page_size * 2);
}
// This test emphasises whether the addressing modifiers are propagated and
// simulated correctly.
TEST_SVE(sve_ldff1_regression_test) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
size_t page_size = sysconf(_SC_PAGE_SIZE);
VIXL_ASSERT(page_size > static_cast<size_t>(config->sve_vl_in_bytes()));
uintptr_t data = reinterpret_cast<uintptr_t>(mmap(NULL,
page_size * 2,
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS,
-1,
0));
uintptr_t middle = data + page_size;
// Fill the accessible page with arbitrary data.
for (size_t i = 0; i < page_size; i++) {
// Reverse bits so we get a mixture of positive and negative values.
uint8_t byte = ReverseBits(static_cast<uint8_t>(i));
memcpy(reinterpret_cast<void*>(middle + i), &byte, 1);
// Make every byte slightly different from its mirror image, and copy the
// bytes in the reverse direction, which is convenient for verifying loads at
// negative indices.
byte += 1;
memcpy(reinterpret_cast<void*>(middle - i), &byte, 1);
}
PRegister all = p6;
__ Ptrue(all.VnB());
__ Mov(x0, middle);
__ Index(z31.VnS(), 0, 3);
__ Neg(z30.VnS(), z31.VnS());
__ Setffr();
// Scalar plus vector 32 unscaled offset
__ Ldff1b(z1.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
__ Ldff1h(z2.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW));
__ Ldff1w(z3.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
__ Ldff1sb(z4.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW));
__ Ldff1sh(z5.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
// Scalar plus vector 32 scaled offset
__ Ldff1h(z6.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW, 1));
__ Ldff1w(z7.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW, 2));
__ Ldff1sh(z8.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW, 1));
__ Index(z31.VnD(), 0, 3);
__ Neg(z30.VnD(), z31.VnD());
// Ensure only the low 32 bits are used when testing with positive index
// values. This also tests that the indices are treated as unsigned in `uxtw`
// form.
__ Mov(x3, 0x8000000080000000);
__ Dup(z28.VnD(), x3);
__ Sub(x2, x0, 0x80000000);
__ Add(z29.VnD(), z31.VnD(), z28.VnD());
// Scalar plus vector 32 unpacked unscaled offset
__ Ldff1b(z9.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
__ Ldff1h(z10.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW));
__ Ldff1w(z11.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
__ Ldff1sb(z12.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
__ Ldff1sh(z13.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW));
__ Ldff1sw(z14.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
// Scalar plus vector 32 unpacked scaled offset
__ Ldff1h(z15.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 1));
__ Ldff1w(z16.VnD(), all.Zeroing(), SVEMemOperand(x0, z31.VnD(), UXTW, 2));
__ Ldff1d(z17.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 3));
__ Ldff1sh(z18.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 1));
__ Ldff1sw(z19.VnD(), all.Zeroing(), SVEMemOperand(x0, z31.VnD(), UXTW, 2));
__ Sub(x0, x0, x3);
// Note that `0x8000000080000000` has been added to the positive indices. The
// wrong address would be accessed if an index were treated as negative.
// Scalar plus vector 64 unscaled offset
__ Ldff1b(z20.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
__ Ldff1h(z21.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
__ Ldff1w(z22.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
__ Ldff1sh(z23.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
__ Ldff1sw(z24.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
// Scalar plus vector 64 scaled offset
__ Lsr(z29.VnD(), z28.VnD(), 1); // Shift right to 0x4000000040000000
__ Add(z30.VnD(), z31.VnD(), z29.VnD());
__ Ldff1h(z25.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 1));
__ Ldff1sh(z26.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 1));
__ Lsr(z29.VnD(), z29.VnD(), 1); // Shift right to 0x2000000020000000
__ Add(z30.VnD(), z31.VnD(), z29.VnD());
__ Ldff1w(z27.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 2));
__ Ldff1sw(z28.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 2));
__ Lsr(z29.VnD(), z29.VnD(), 1); // Shift right to 0x1000000010000000
__ Add(z30.VnD(), z31.VnD(), z29.VnD());
__ Ldff1d(z29.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 3));
__ Rdffr(p1.VnB());
__ Cntp(x10, all, p1.VnB());
END();
if (CAN_RUN()) {
RUN();
int64_t loaded_data_in_bytes = core.xreg(x10.GetCode());
// Only check 128 bits in this test.
if (loaded_data_in_bytes < kQRegSizeInBytes) {
// Report a warning when the fault-tolerant loads detected faults before all
// the expected loads had been performed.
printf(
"WARNING: Fault-tolerant loads detected faults before the "
"expected loads completed.\n");
return;
}
// Scalar plus vector 32 unscaled offset
uint32_t expected_z1[] = {0x00000090, 0x00000060, 0x000000c0, 0x00000001};
uint32_t expected_z2[] = {0x00001191, 0x0000a161, 0x000041c1, 0x00008001};
uint32_t expected_z3[] = {0x30d05090, 0x9010e060, 0x60a020c0, 0xc0408001};
uint32_t expected_z4[] = {0xffffff91, 0x00000061, 0xffffffc1, 0x00000001};
uint32_t expected_z5[] = {0x00005090, 0xffffe060, 0x000020c0, 0xffff8001};
ASSERT_EQUAL_SVE(expected_z1, z1.VnS());
ASSERT_EQUAL_SVE(expected_z2, z2.VnS());
ASSERT_EQUAL_SVE(expected_z3, z3.VnS());
ASSERT_EQUAL_SVE(expected_z4, z4.VnS());
ASSERT_EQUAL_SVE(expected_z5, z5.VnS());
// Scalar plus vector 32 scaled offset
uint32_t expected_z6[] = {0x0000c848, 0x0000b030, 0x0000e060, 0x00008001};
uint32_t expected_z7[] = {0xe464a424, 0xd8589818, 0xf070b030, 0xc0408001};
uint32_t expected_z8[] = {0xffff8949, 0xffffd131, 0xffffa161, 0xffff8001};
ASSERT_EQUAL_SVE(expected_z6, z6.VnS());
ASSERT_EQUAL_SVE(expected_z7, z7.VnS());
ASSERT_EQUAL_SVE(expected_z8, z8.VnS());
// Scalar plus vector 32 unpacked unscaled offset
uint64_t expected_z9[] = {0x00000000000000c0, 0x0000000000000001};
uint64_t expected_z10[] = {0x00000000000041c1, 0x0000000000008001};
uint64_t expected_z11[] = {0x0000000060a020c0, 0x00000000c0408001};
uint64_t expected_z12[] = {0xffffffffffffffc0, 0x0000000000000001};
uint64_t expected_z13[] = {0x00000000000041c1, 0xffffffffffff8001};
uint64_t expected_z14[] = {0x0000000060a020c0, 0xffffffffc0408001};
ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
// Scalar plus vector 32 unpacked scaled offset
uint64_t expected_z15[] = {0x000000000000a161, 0x0000000000008001};
uint64_t expected_z16[] = {0x00000000f070b030, 0x00000000c0408001};
uint64_t expected_z17[] = {0x8949c929a969e919, 0xe060a020c0408001};
uint64_t expected_z18[] = {0xffffffffffffa161, 0xffffffffffff8001};
uint64_t expected_z19[] = {0xfffffffff070b030, 0xffffffffc0408001};
ASSERT_EQUAL_SVE(expected_z15, z15.VnD());
ASSERT_EQUAL_SVE(expected_z16, z16.VnD());
ASSERT_EQUAL_SVE(expected_z17, z17.VnD());
ASSERT_EQUAL_SVE(expected_z18, z18.VnD());
ASSERT_EQUAL_SVE(expected_z19, z19.VnD());
// Scalar plus vector 64 unscaled offset
uint64_t expected_z20[] = {0x00000000000000c0, 0x0000000000000001};
uint64_t expected_z21[] = {0x00000000000020c0, 0x0000000000008001};
uint64_t expected_z22[] = {0x0000000060a020c0, 0x00000000c0408001};
uint64_t expected_z23[] = {0x00000000000020c0, 0xffffffffffff8001};
uint64_t expected_z24[] = {0x0000000060a020c0, 0xffffffffc0408001};
ASSERT_EQUAL_SVE(expected_z20, z20.VnD());
ASSERT_EQUAL_SVE(expected_z21, z21.VnD());
ASSERT_EQUAL_SVE(expected_z22, z22.VnD());
ASSERT_EQUAL_SVE(expected_z23, z23.VnD());
ASSERT_EQUAL_SVE(expected_z24, z24.VnD());
uint64_t expected_z25[] = {0x000000000000e060, 0x0000000000008001};
uint64_t expected_z26[] = {0xffffffffffffe060, 0xffffffffffff8001};
uint64_t expected_z27[] = {0x00000000f070b030, 0x00000000c0408001};
uint64_t expected_z28[] = {0xfffffffff070b030, 0xffffffffc0408001};
uint64_t expected_z29[] = {0xf878b838d8589818, 0xe060a020c0408001};
// Scalar plus vector 64 scaled offset
ASSERT_EQUAL_SVE(expected_z25, z25.VnD());
ASSERT_EQUAL_SVE(expected_z26, z26.VnD());
ASSERT_EQUAL_SVE(expected_z27, z27.VnD());
ASSERT_EQUAL_SVE(expected_z28, z28.VnD());
ASSERT_EQUAL_SVE(expected_z29, z29.VnD());
}
munmap(reinterpret_cast<void*>(data), page_size * 2);
}
// This test emphasises whether the addressing modifiers are propagated and
// simulated correctly.
TEST_SVE(sve_ld1_regression_test) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
size_t page_size = sysconf(_SC_PAGE_SIZE);
VIXL_ASSERT(page_size > static_cast<size_t>(config->sve_vl_in_bytes()));
uintptr_t data = reinterpret_cast<uintptr_t>(mmap(NULL,
page_size * 2,
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS,
-1,
0));
uintptr_t middle = data + page_size;
// Fill the accessible page with arbitrary data.
for (size_t i = 0; i < page_size; i++) {
// Reverse bits so we get a mixture of positive and negative values.
uint8_t byte = ReverseBits(static_cast<uint8_t>(i));
memcpy(reinterpret_cast<void*>(middle + i), &byte, 1);
// Make every byte slightly different from its mirror image, and copy the
// bytes in the reverse direction, which is convenient for verifying loads at
// negative indices.
byte += 1;
memcpy(reinterpret_cast<void*>(middle - i), &byte, 1);
}
PRegister all = p6;
__ Ptrue(all.VnB());
__ Mov(x0, middle);
__ Index(z31.VnS(), 0, 3);
__ Neg(z30.VnS(), z31.VnS());
// Scalar plus vector 32 unscaled offset
__ Ld1b(z1.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
__ Ld1h(z2.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW));
__ Ld1w(z3.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
__ Ld1sb(z4.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW));
__ Ld1sh(z5.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
// Scalar plus vector 32 scaled offset
__ Ld1h(z6.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW, 1));
__ Ld1w(z7.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW, 2));
__ Ld1sh(z8.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW, 1));
__ Index(z31.VnD(), 0, 3);
__ Neg(z30.VnD(), z31.VnD());
// Ensure only the low 32 bits are used when testing with positive index
// values. This also tests that the indices are treated as unsigned in `uxtw`
// form.
__ Mov(x3, 0x8000000080000000);
__ Dup(z28.VnD(), x3);
__ Sub(x2, x0, 0x80000000);
__ Add(z29.VnD(), z31.VnD(), z28.VnD());
// Scalar plus vector 32 unpacked unscaled offset
__ Ld1b(z9.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
__ Ld1h(z10.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW));
__ Ld1w(z11.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
__ Ld1sb(z12.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
__ Ld1sh(z13.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW));
__ Ld1sw(z14.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
// Scalar plus vector 32 unpacked scaled offset
__ Ld1h(z15.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 1));
__ Ld1w(z16.VnD(), all.Zeroing(), SVEMemOperand(x0, z31.VnD(), UXTW, 2));
__ Ld1d(z17.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 3));
__ Ld1sh(z18.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 1));
__ Ld1sw(z19.VnD(), all.Zeroing(), SVEMemOperand(x0, z31.VnD(), UXTW, 2));
__ Sub(x0, x0, x3);
// Note that `0x8000000080000000` has been added to the positive indices. The
// wrong address would be accessed if an index were treated as negative.
// Scalar plus vector 64 unscaled offset
__ Ld1b(z20.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
__ Ld1h(z21.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
__ Ld1w(z22.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
__ Ld1sh(z23.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
__ Ld1sw(z24.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
// Scalar plus vector 64 scaled offset
__ Lsr(z29.VnD(), z28.VnD(), 1); // Shift right to 0x4000000040000000
__ Add(z30.VnD(), z31.VnD(), z29.VnD());
__ Ld1h(z25.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 1));
__ Ld1sh(z26.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 1));
__ Lsr(z29.VnD(), z29.VnD(), 1); // Shift right to 0x2000000020000000
__ Add(z30.VnD(), z31.VnD(), z29.VnD());
__ Ld1w(z27.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 2));
__ Ld1sw(z28.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 2));
__ Lsr(z29.VnD(), z29.VnD(), 1); // Shift right to 0x1000000010000000
__ Add(z30.VnD(), z31.VnD(), z29.VnD());
__ Ld1d(z29.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 3));
END();
if (CAN_RUN()) {
RUN();
// Scalar plus vector 32 unscaled offset
uint32_t expected_z1[] = {0x00000090, 0x00000060, 0x000000c0, 0x00000001};
uint32_t expected_z2[] = {0x00001191, 0x0000a161, 0x000041c1, 0x00008001};
uint32_t expected_z3[] = {0x30d05090, 0x9010e060, 0x60a020c0, 0xc0408001};
uint32_t expected_z4[] = {0xffffff91, 0x00000061, 0xffffffc1, 0x00000001};
uint32_t expected_z5[] = {0x00005090, 0xffffe060, 0x000020c0, 0xffff8001};
ASSERT_EQUAL_SVE(expected_z1, z1.VnS());
ASSERT_EQUAL_SVE(expected_z2, z2.VnS());
ASSERT_EQUAL_SVE(expected_z3, z3.VnS());
ASSERT_EQUAL_SVE(expected_z4, z4.VnS());
ASSERT_EQUAL_SVE(expected_z5, z5.VnS());
// Scalar plus vector 32 scaled offset
uint32_t expected_z6[] = {0x0000c848, 0x0000b030, 0x0000e060, 0x00008001};
uint32_t expected_z7[] = {0xe464a424, 0xd8589818, 0xf070b030, 0xc0408001};
uint32_t expected_z8[] = {0xffff8949, 0xffffd131, 0xffffa161, 0xffff8001};
ASSERT_EQUAL_SVE(expected_z6, z6.VnS());
ASSERT_EQUAL_SVE(expected_z7, z7.VnS());
ASSERT_EQUAL_SVE(expected_z8, z8.VnS());
// Scalar plus vector 32 unpacked unscaled offset
uint64_t expected_z9[] = {0x00000000000000c0, 0x0000000000000001};
uint64_t expected_z10[] = {0x00000000000041c1, 0x0000000000008001};
uint64_t expected_z11[] = {0x0000000060a020c0, 0x00000000c0408001};
uint64_t expected_z12[] = {0xffffffffffffffc0, 0x0000000000000001};
uint64_t expected_z13[] = {0x00000000000041c1, 0xffffffffffff8001};
uint64_t expected_z14[] = {0x0000000060a020c0, 0xffffffffc0408001};
ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
// Scalar plus vector 32 unpacked scaled offset
uint64_t expected_z15[] = {0x000000000000a161, 0x0000000000008001};
uint64_t expected_z16[] = {0x00000000f070b030, 0x00000000c0408001};
uint64_t expected_z17[] = {0x8949c929a969e919, 0xe060a020c0408001};
uint64_t expected_z18[] = {0xffffffffffffa161, 0xffffffffffff8001};
uint64_t expected_z19[] = {0xfffffffff070b030, 0xffffffffc0408001};
ASSERT_EQUAL_SVE(expected_z15, z15.VnD());
ASSERT_EQUAL_SVE(expected_z16, z16.VnD());
ASSERT_EQUAL_SVE(expected_z17, z17.VnD());
ASSERT_EQUAL_SVE(expected_z18, z18.VnD());
ASSERT_EQUAL_SVE(expected_z19, z19.VnD());
// Scalar plus vector 64 unscaled offset
uint64_t expected_z20[] = {0x00000000000000c0, 0x0000000000000001};
uint64_t expected_z21[] = {0x00000000000020c0, 0x0000000000008001};
uint64_t expected_z22[] = {0x0000000060a020c0, 0x00000000c0408001};
uint64_t expected_z23[] = {0x00000000000020c0, 0xffffffffffff8001};
uint64_t expected_z24[] = {0x0000000060a020c0, 0xffffffffc0408001};
ASSERT_EQUAL_SVE(expected_z20, z20.VnD());
ASSERT_EQUAL_SVE(expected_z21, z21.VnD());
ASSERT_EQUAL_SVE(expected_z22, z22.VnD());
ASSERT_EQUAL_SVE(expected_z23, z23.VnD());
ASSERT_EQUAL_SVE(expected_z24, z24.VnD());
uint64_t expected_z25[] = {0x000000000000e060, 0x0000000000008001};
uint64_t expected_z26[] = {0xffffffffffffe060, 0xffffffffffff8001};
uint64_t expected_z27[] = {0x00000000f070b030, 0x00000000c0408001};
uint64_t expected_z28[] = {0xfffffffff070b030, 0xffffffffc0408001};
uint64_t expected_z29[] = {0xf878b838d8589818, 0xe060a020c0408001};
// Scalar plus vector 64 scaled offset
ASSERT_EQUAL_SVE(expected_z25, z25.VnD());
ASSERT_EQUAL_SVE(expected_z26, z26.VnD());
ASSERT_EQUAL_SVE(expected_z27, z27.VnD());
ASSERT_EQUAL_SVE(expected_z28, z28.VnD());
ASSERT_EQUAL_SVE(expected_z29, z29.VnD());
}
munmap(reinterpret_cast<void*>(data), page_size * 2);
}
// Test gather loads by comparing them with the result of a set of equivalent
// scalar loads.
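// Conceptually, the reference is what the following scalar loop would
// compute. This is only a sketch: `MemoryRead` and `SignExtend` are
// illustrative names, and `ScalarLoadHelper` emits equivalent instructions
// rather than running C++ at test time.
//
//   for (unsigned i = 0; i < lane_count; i++) {
//     uint64_t v = pg[i] ? MemoryRead(addresses[i], msize_in_bits) : 0;
//     zt_ref[i] = is_signed ? SignExtend(v, msize_in_bits) : v;
//   }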
template <typename T>
static void GatherLoadScalarPlusVectorHelper(Test* config,
unsigned msize_in_bits,
unsigned esize_in_bits,
Ld1Macro ld1,
Ld1Macro ldff1,
T mod,
bool is_signed,
bool is_scaled) {
// SVE supports 32- and 64-bit addressing for gather loads.
VIXL_ASSERT((esize_in_bits == kSRegSize) || (esize_in_bits == kDRegSize));
static const unsigned kMaxLaneCount = kZRegMaxSize / kSRegSize;
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
unsigned msize_in_bytes = msize_in_bits / kBitsPerByte;
int vl = config->sve_vl_in_bytes();
uint64_t addresses[kMaxLaneCount];
uint64_t offsets[kMaxLaneCount];
uint64_t max_address = 0;
uint64_t buffer_size = vl * 64;
uint64_t data = reinterpret_cast<uintptr_t>(malloc(buffer_size));
// Fill the buffer with arbitrary data, and generate random offsets into the
// buffer together with the corresponding absolute addresses, written out
// through the array arguments.
BufferFillingHelper(data,
buffer_size,
msize_in_bytes,
kMaxLaneCount,
offsets,
addresses,
&max_address);
ZRegister zn = z0.WithLaneSize(esize_in_bits);
ZRegister zt_ref = z1.WithLaneSize(esize_in_bits);
ZRegister zt = z2.WithLaneSize(esize_in_bits);
ZRegister zt_ff = z3.WithLaneSize(esize_in_bits);
PRegisterWithLaneSize pg_ff = p1.WithLaneSize(esize_in_bits);
PRegisterWithLaneSize pg_diff = p2.WithLaneSize(esize_in_bits);
int shift = 0;
if (is_scaled) {
shift = std::log2(msize_in_bytes);
for (unsigned i = 0; i < kMaxLaneCount; i++) {
// Ensure the offsets are multiples of the scale factor of the operation.
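// For example, a 32-bit access (msize_in_bytes == 4) gives shift == 2, so
// each offset is rounded down to a multiple of four.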
offsets[i] = (offsets[i] >> shift) << shift;
addresses[i] = data + offsets[i];
}
}
PRegister all = p6;
__ Ptrue(all.WithLaneSize(esize_in_bits));
PRegisterZ pg = p0.Zeroing();
Initialise(&masm,
pg,
0x9abcdef012345678,
0xabcdef0123456789,
0xf4f3f1f0fefdfcfa,
0xf9f8f6f5f3f2f1ff);
__ Mov(x0, data);
// Generate a reference result for the scalar-plus-vector form using scalar
// loads.
ScalarLoadHelper(&masm,
vl,
addresses,
zt_ref,
pg,
esize_in_bits,
msize_in_bits,
is_signed);
InsrHelper(&masm, zn, offsets);
if (is_scaled) {
// Scale down the offsets if testing scaled-offset operation.
__ Lsr(zn, zn, shift);
}
(masm.*ld1)(zt, pg, SVEMemOperand(x0, zn, mod, shift));
Register ffr_check_count = x17;
__ Mov(ffr_check_count, 0);
// Test data correctness when the gather load reads from different addresses.
// The first-fault behaviour itself is exercised in `Ldff1Helper`.
__ Setffr();
(masm.*ldff1)(zt_ff, pg, SVEMemOperand(x0, zn, mod, shift));
// Compare the two vector registers, and accumulate the number of mismatching
// lanes in `ffr_check_count`.
__ Rdffrs(pg_ff.VnB(), all.Zeroing());
__ Cmpeq(pg_diff, all.Zeroing(), zt_ref, zt_ff);
__ Eor(pg_diff.VnB(), all.Zeroing(), pg_diff.VnB(), pg_ff.VnB());
__ Incp(ffr_check_count, pg_diff);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(zt_ref, zt);
ASSERT_EQUAL_64(0, ffr_check_count);
}
free(reinterpret_cast<void*>(data));
}
// Test gather loads by comparing them with the result of a set of equivalent
// scalar loads.
template <typename F>
static void GatherLoadScalarPlusScalarOrImmHelper(Test* config,
unsigned msize_in_bits,
unsigned esize_in_bits,
F sve_ld1,
bool is_signed) {
// SVE supports 32- and 64-bit addressing for gather loads.
VIXL_ASSERT((esize_in_bits == kSRegSize) || (esize_in_bits == kDRegSize));
static const unsigned kMaxLaneCount = kZRegMaxSize / kSRegSize;
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
unsigned msize_in_bytes = msize_in_bits / kBitsPerByte;
int vl = config->sve_vl_in_bytes();
uint64_t addresses[kMaxLaneCount];
uint64_t offsets[kMaxLaneCount];
uint64_t max_address = 0;
uint64_t buffer_size = vl * 64;
uint64_t data = reinterpret_cast<uintptr_t>(malloc(buffer_size));
BufferFillingHelper(data,
buffer_size,
msize_in_bytes,
kMaxLaneCount,
offsets,
addresses,
&max_address);
// Use maximised offsets to check that the address calculation is performed
// modulo 2^64, and that the vector addresses are not sign-extended.
uint64_t uint_e_max = (esize_in_bits == kDRegSize) ? UINT64_MAX : UINT32_MAX;
uint64_t maxed_offsets[kMaxLaneCount];
uint64_t maxed_offsets_imm = max_address - uint_e_max;
for (unsigned i = 0; i < kMaxLaneCount; i++) {
maxed_offsets[i] = addresses[i] - maxed_offsets_imm;
}
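// The address calculation wraps modulo 2^64 (for D-sized lanes), so:
//   maxed_offsets[i] + maxed_offsets_imm
//       == (addresses[i] - maxed_offsets_imm) + maxed_offsets_imm
//       == addresses[i]
// even though the intermediate values wrap around.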
ZRegister zn = z0.WithLaneSize(esize_in_bits);
ZRegister zt_addresses = z1.WithLaneSize(esize_in_bits);
ZRegister zt_offsets = z2.WithLaneSize(esize_in_bits);
ZRegister zt_maxed = z3.WithLaneSize(esize_in_bits);
ZRegister zt_ref = z4.WithLaneSize(esize_in_bits);
PRegisterZ pg = p0.Zeroing();
Initialise(&masm,
pg,
0x9abcdef012345678,
0xabcdef0123456789,
0xf4f3f1f0fefdfcfa,
0xf9f8f6f5f3f2f0ff);
// Execute each load.
if (esize_in_bits == kDRegSize) {
// Only test `addresses` if we can use 64-bit pointers. InsrHelper will fail
// if any value won't fit in a lane of zn.
InsrHelper(&masm, zn, addresses);
(masm.*sve_ld1)(zt_addresses, pg, SVEMemOperand(zn));
}
InsrHelper(&masm, zn, offsets);
(masm.*sve_ld1)(zt_offsets, pg, SVEMemOperand(zn, data));
InsrHelper(&masm, zn, maxed_offsets);
(masm.*sve_ld1)(zt_maxed, pg, SVEMemOperand(zn, maxed_offsets_imm));
// Generate a reference result using scalar loads.
ScalarLoadHelper(&masm,
vl,
addresses,
zt_ref,
pg,
esize_in_bits,
msize_in_bits,
is_signed);
END();
if (CAN_RUN()) {
RUN();
if (esize_in_bits == kDRegSize) {
ASSERT_EQUAL_SVE(zt_ref, zt_addresses);
}
ASSERT_EQUAL_SVE(zt_ref, zt_offsets);
ASSERT_EQUAL_SVE(zt_ref, zt_maxed);
}
free(reinterpret_cast<void*>(data));
}
TEST_SVE(sve_ld1b_64bit_vector_plus_immediate) {
GatherLoadScalarPlusScalarOrImmHelper(config,
kBRegSize,
kDRegSize,
&MacroAssembler::Ld1b,
false);
}
TEST_SVE(sve_ld1h_64bit_vector_plus_immediate) {
GatherLoadScalarPlusScalarOrImmHelper(config,
kHRegSize,
kDRegSize,
&MacroAssembler::Ld1h,
false);
}
TEST_SVE(sve_ld1w_64bit_vector_plus_immediate) {
GatherLoadScalarPlusScalarOrImmHelper(config,
kSRegSize,
kDRegSize,
&MacroAssembler::Ld1w,
false);
}
TEST_SVE(sve_ld1d_64bit_vector_plus_immediate) {
GatherLoadScalarPlusScalarOrImmHelper(config,
kDRegSize,
kDRegSize,
&MacroAssembler::Ld1d,
false);
}
TEST_SVE(sve_ld1sb_64bit_vector_plus_immediate) {
GatherLoadScalarPlusScalarOrImmHelper(config,
kBRegSize,
kDRegSize,
&MacroAssembler::Ld1sb,
true);
}
TEST_SVE(sve_ld1sh_64bit_vector_plus_immediate) {
GatherLoadScalarPlusScalarOrImmHelper(config,
kHRegSize,
kDRegSize,
&MacroAssembler::Ld1sh,
true);
}
TEST_SVE(sve_ld1sw_64bit_vector_plus_immediate) {
GatherLoadScalarPlusScalarOrImmHelper(config,
kSRegSize,
kDRegSize,
&MacroAssembler::Ld1sw,
true);
}
TEST_SVE(sve_ld1b_32bit_vector_plus_immediate) {
GatherLoadScalarPlusScalarOrImmHelper(config,
kBRegSize,
kSRegSize,
&MacroAssembler::Ld1b,
false);
}
TEST_SVE(sve_ld1h_32bit_vector_plus_immediate) {
GatherLoadScalarPlusScalarOrImmHelper(config,
kHRegSize,
kSRegSize,
&MacroAssembler::Ld1h,
false);
}
TEST_SVE(sve_ld1w_32bit_vector_plus_immediate) {
GatherLoadScalarPlusScalarOrImmHelper(config,
kSRegSize,
kSRegSize,
&MacroAssembler::Ld1w,
false);
}
TEST_SVE(sve_ld1sb_32bit_vector_plus_immediate) {
GatherLoadScalarPlusScalarOrImmHelper(config,
kBRegSize,
kSRegSize,
&MacroAssembler::Ld1sb,
true);
}
TEST_SVE(sve_ld1sh_32bit_vector_plus_immediate) {
GatherLoadScalarPlusScalarOrImmHelper(config,
kHRegSize,
kSRegSize,
&MacroAssembler::Ld1sh,
true);
}
TEST_SVE(sve_ld1_scalar_plus_vector_32_scaled_offset) {
auto ld1_32_scaled_offset_helper =
std::bind(&GatherLoadScalarPlusVectorHelper<Extend>,
config,
std::placeholders::_1,
kSRegSize,
std::placeholders::_2,
std::placeholders::_3,
std::placeholders::_4,
std::placeholders::_5,
true);
Ld1Macro ld1h = &MacroAssembler::Ld1h;
Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
ld1_32_scaled_offset_helper(kHRegSize, ld1h, ldff1h, UXTW, false);
ld1_32_scaled_offset_helper(kHRegSize, ld1h, ldff1h, SXTW, false);
Ld1Macro ld1w = &MacroAssembler::Ld1w;
Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
ld1_32_scaled_offset_helper(kSRegSize, ld1w, ldff1w, UXTW, false);
ld1_32_scaled_offset_helper(kSRegSize, ld1w, ldff1w, SXTW, false);
Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
ld1_32_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, UXTW, true);
ld1_32_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, SXTW, true);
}
TEST_SVE(sve_ld1_scalar_plus_vector_32_unscaled_offset) {
auto ld1_32_unscaled_offset_helper =
std::bind(&GatherLoadScalarPlusVectorHelper<Extend>,
config,
std::placeholders::_1,
kSRegSize,
std::placeholders::_2,
std::placeholders::_3,
std::placeholders::_4,
std::placeholders::_5,
false);
Ld1Macro ld1b = &MacroAssembler::Ld1b;
Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
ld1_32_unscaled_offset_helper(kBRegSize, ld1b, ldff1b, UXTW, false);
ld1_32_unscaled_offset_helper(kBRegSize, ld1b, ldff1b, SXTW, false);
Ld1Macro ld1h = &MacroAssembler::Ld1h;
Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
ld1_32_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, UXTW, false);
ld1_32_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, SXTW, false);
Ld1Macro ld1w = &MacroAssembler::Ld1w;
Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
ld1_32_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, UXTW, false);
ld1_32_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, SXTW, false);
Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
ld1_32_unscaled_offset_helper(kBRegSize, ld1sb, ldff1sb, UXTW, true);
ld1_32_unscaled_offset_helper(kBRegSize, ld1sb, ldff1sb, SXTW, true);
Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
ld1_32_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, UXTW, true);
ld1_32_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, SXTW, true);
}
TEST_SVE(sve_ld1_scalar_plus_vector_32_unpacked_scaled_offset) {
auto ld1_32_unpacked_scaled_offset_helper =
std::bind(&GatherLoadScalarPlusVectorHelper<Extend>,
config,
std::placeholders::_1,
kDRegSize,
std::placeholders::_2,
std::placeholders::_3,
std::placeholders::_4,
std::placeholders::_5,
true);
Ld1Macro ld1h = &MacroAssembler::Ld1h;
Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
ld1_32_unpacked_scaled_offset_helper(kHRegSize, ld1h, ldff1h, UXTW, false);
ld1_32_unpacked_scaled_offset_helper(kHRegSize, ld1h, ldff1h, SXTW, false);
Ld1Macro ld1w = &MacroAssembler::Ld1w;
Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
ld1_32_unpacked_scaled_offset_helper(kSRegSize, ld1w, ldff1w, UXTW, false);
ld1_32_unpacked_scaled_offset_helper(kSRegSize, ld1w, ldff1w, SXTW, false);
Ld1Macro ld1d = &MacroAssembler::Ld1d;
Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
ld1_32_unpacked_scaled_offset_helper(kDRegSize, ld1d, ldff1d, UXTW, false);
ld1_32_unpacked_scaled_offset_helper(kDRegSize, ld1d, ldff1d, SXTW, false);
Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
ld1_32_unpacked_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, UXTW, true);
ld1_32_unpacked_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, SXTW, true);
Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
ld1_32_unpacked_scaled_offset_helper(kSRegSize, ld1sw, ldff1sw, UXTW, true);
ld1_32_unpacked_scaled_offset_helper(kSRegSize, ld1sw, ldff1sw, SXTW, true);
}
TEST_SVE(sve_ld1_scalar_plus_vector_32_unpacked_unscaled_offset) {
auto ld1_32_unpacked_unscaled_offset_helper =
std::bind(&GatherLoadScalarPlusVectorHelper<Extend>,
config,
std::placeholders::_1,
kDRegSize,
std::placeholders::_2,
std::placeholders::_3,
std::placeholders::_4,
std::placeholders::_5,
false);
Ld1Macro ld1h = &MacroAssembler::Ld1h;
Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
ld1_32_unpacked_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, UXTW, false);
ld1_32_unpacked_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, SXTW, false);
Ld1Macro ld1w = &MacroAssembler::Ld1w;
Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
ld1_32_unpacked_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, UXTW, false);
ld1_32_unpacked_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, SXTW, false);
Ld1Macro ld1d = &MacroAssembler::Ld1d;
Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
ld1_32_unpacked_unscaled_offset_helper(kDRegSize, ld1d, ldff1d, UXTW, false);
ld1_32_unpacked_unscaled_offset_helper(kDRegSize, ld1d, ldff1d, SXTW, false);
Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
ld1_32_unpacked_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, UXTW, true);
ld1_32_unpacked_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, SXTW, true);
Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
ld1_32_unpacked_unscaled_offset_helper(kSRegSize, ld1sw, ldff1sw, UXTW, true);
ld1_32_unpacked_unscaled_offset_helper(kSRegSize, ld1sw, ldff1sw, SXTW, true);
}
TEST_SVE(sve_ld1_scalar_plus_vector_64_scaled_offset) {
auto ld1_64_scaled_offset_helper =
std::bind(&GatherLoadScalarPlusVectorHelper<Shift>,
config,
std::placeholders::_1,
kDRegSize,
std::placeholders::_2,
std::placeholders::_3,
LSL,
std::placeholders::_4,
true);
Ld1Macro ld1h = &MacroAssembler::Ld1h;
Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
ld1_64_scaled_offset_helper(kHRegSize, ld1h, ldff1h, false);
Ld1Macro ld1w = &MacroAssembler::Ld1w;
Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
ld1_64_scaled_offset_helper(kSRegSize, ld1w, ldff1w, false);
Ld1Macro ld1d = &MacroAssembler::Ld1d;
Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
ld1_64_scaled_offset_helper(kDRegSize, ld1d, ldff1d, false);
Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
ld1_64_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, true);
Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
ld1_64_scaled_offset_helper(kSRegSize, ld1sw, ldff1sw, true);
}
TEST_SVE(sve_ld1_scalar_plus_vector_64_unscaled_offset) {
auto ld1_64_unscaled_offset_helper =
std::bind(&GatherLoadScalarPlusVectorHelper<Shift>,
config,
std::placeholders::_1,
kDRegSize,
std::placeholders::_2,
std::placeholders::_3,
NO_SHIFT,
std::placeholders::_4,
false);
Ld1Macro ld1b = &MacroAssembler::Ld1b;
Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
ld1_64_unscaled_offset_helper(kBRegSize, ld1b, ldff1b, false);
Ld1Macro ld1h = &MacroAssembler::Ld1h;
Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
ld1_64_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, false);
Ld1Macro ld1w = &MacroAssembler::Ld1w;
Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
ld1_64_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, false);
Ld1Macro ld1d = &MacroAssembler::Ld1d;
Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
ld1_64_unscaled_offset_helper(kDRegSize, ld1d, ldff1d, false);
Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
ld1_64_unscaled_offset_helper(kBRegSize, ld1sb, ldff1sb, true);
Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
ld1_64_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, true);
Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
ld1_64_unscaled_offset_helper(kSRegSize, ld1sw, ldff1sw, true);
}
TEST_SVE(sve_ldnt1) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int data_size = kZRegMaxSizeInBytes * 16;
uint8_t* data = new uint8_t[data_size];
for (int i = 0; i < data_size; i++) {
data[i] = i & 0xff;
}
// Set the base half-way through the buffer so we can use negative indices.
__ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
__ Ptrue(p0.VnB());
__ Punpklo(p1.VnH(), p0.VnB());
__ Punpklo(p2.VnH(), p1.VnB());
__ Punpklo(p3.VnH(), p2.VnB());
__ Punpklo(p4.VnH(), p3.VnB());
__ Mov(x1, 42);
__ Ld1b(z0.VnB(), p1.Zeroing(), SVEMemOperand(x0, x1));
__ Ldnt1b(z1.VnB(), p1.Zeroing(), SVEMemOperand(x0, x1));
__ Mov(x1, -21);
__ Ld1h(z2.VnH(), p2.Zeroing(), SVEMemOperand(x0, x1, LSL, 1));
__ Ldnt1h(z3.VnH(), p2.Zeroing(), SVEMemOperand(x0, x1, LSL, 1));
__ Mov(x1, 10);
__ Ld1w(z4.VnS(), p3.Zeroing(), SVEMemOperand(x0, x1, LSL, 2));
__ Ldnt1w(z5.VnS(), p3.Zeroing(), SVEMemOperand(x0, x1, LSL, 2));
__ Mov(x1, -5);
__ Ld1d(z6.VnD(), p4.Zeroing(), SVEMemOperand(x0, x1, LSL, 3));
__ Ldnt1d(z7.VnD(), p4.Zeroing(), SVEMemOperand(x0, x1, LSL, 3));
__ Ld1b(z8.VnB(), p1.Zeroing(), SVEMemOperand(x0, 1, SVE_MUL_VL));
__ Ldnt1b(z9.VnB(), p1.Zeroing(), SVEMemOperand(x0, 1, SVE_MUL_VL));
__ Ld1h(z10.VnH(), p2.Zeroing(), SVEMemOperand(x0, -1, SVE_MUL_VL));
__ Ldnt1h(z11.VnH(), p2.Zeroing(), SVEMemOperand(x0, -1, SVE_MUL_VL));
__ Ld1w(z12.VnS(), p3.Zeroing(), SVEMemOperand(x0, 7, SVE_MUL_VL));
__ Ldnt1w(z13.VnS(), p3.Zeroing(), SVEMemOperand(x0, 7, SVE_MUL_VL));
__ Ld1d(z14.VnD(), p4.Zeroing(), SVEMemOperand(x0, -8, SVE_MUL_VL));
__ Ldnt1d(z15.VnD(), p4.Zeroing(), SVEMemOperand(x0, -8, SVE_MUL_VL));
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(z0, z1);
ASSERT_EQUAL_SVE(z2, z3);
ASSERT_EQUAL_SVE(z4, z5);
ASSERT_EQUAL_SVE(z6, z7);
ASSERT_EQUAL_SVE(z8, z9);
ASSERT_EQUAL_SVE(z10, z11);
ASSERT_EQUAL_SVE(z12, z13);
ASSERT_EQUAL_SVE(z14, z15);
}
delete[] data;
}
TEST_SVE(sve_stnt1) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int data_size = kZRegMaxSizeInBytes * 16;
uint8_t* data = new uint8_t[data_size];
// Set the base half-way through the buffer so we can use negative indices.
__ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
__ Ptrue(p0.VnB());
__ Punpklo(p1.VnH(), p0.VnB());
__ Punpklo(p2.VnH(), p1.VnB());
__ Punpklo(p3.VnH(), p2.VnB());
__ Punpklo(p4.VnH(), p3.VnB());
__ Dup(z0.VnB(), 0x55);
__ Index(z1.VnB(), 0, 1);
// Store with all-true and patterned predication, load back, and create a
// reference value for later comparison.
__ Rdvl(x1, 1);
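// `Rdvl(x1, 1)` sets x1 to the vector length in bytes, so the
// scalar-plus-scalar store below targets the same region as the
// `SVE_MUL_VL` form: both stores land one vector length above x0.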
__ Stnt1b(z0.VnB(), p0, SVEMemOperand(x0, x1));
__ Stnt1b(z1.VnB(), p1, SVEMemOperand(x0, 1, SVE_MUL_VL));
__ Ld1b(z2.VnB(), p0.Zeroing(), SVEMemOperand(x0, x1));
__ Sel(z3.VnB(), p1, z1.VnB(), z0.VnB());
// Repeated, with wider elements and different offsets.
__ Rdvl(x1, -1);
__ Lsr(x1, x1, 1);
__ Stnt1h(z0.VnH(), p0, SVEMemOperand(x0, x1, LSL, 1));
__ Stnt1h(z1.VnH(), p2, SVEMemOperand(x0, -1, SVE_MUL_VL));
__ Ld1b(z4.VnB(), p0.Zeroing(), SVEMemOperand(x0, x1, LSL, 1));
__ Sel(z5.VnH(), p2, z1.VnH(), z0.VnH());
__ Rdvl(x1, 7);
__ Lsr(x1, x1, 2);
__ Stnt1w(z0.VnS(), p0, SVEMemOperand(x0, x1, LSL, 2));
__ Stnt1w(z1.VnS(), p3, SVEMemOperand(x0, 7, SVE_MUL_VL));
__ Ld1b(z6.VnB(), p0.Zeroing(), SVEMemOperand(x0, x1, LSL, 2));
__ Sel(z7.VnS(), p3, z1.VnS(), z0.VnS());
__ Rdvl(x1, -8);
__ Lsr(x1, x1, 3);
__ Stnt1d(z0.VnD(), p0, SVEMemOperand(x0, x1, LSL, 3));
__ Stnt1d(z1.VnD(), p4, SVEMemOperand(x0, -8, SVE_MUL_VL));
__ Ld1b(z8.VnB(), p0.Zeroing(), SVEMemOperand(x0, x1, LSL, 3));
__ Sel(z9.VnD(), p4, z1.VnD(), z0.VnD());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(z2, z3);
ASSERT_EQUAL_SVE(z4, z5);
ASSERT_EQUAL_SVE(z6, z7);
ASSERT_EQUAL_SVE(z8, z9);
}
delete[] data;
}
TEST_SVE(sve_ld1rq) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int data_size = (kQRegSizeInBytes + 128) * 2;
uint8_t* data = new uint8_t[data_size];
for (int i = 0; i < data_size; i++) {
data[i] = i & 0xff;
}
// Set the base half-way through the buffer so we can use negative indices.
__ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
__ Index(z0.VnB(), 0, 1);
__ Ptrue(p0.VnB());
__ Cmplo(p0.VnB(), p0.Zeroing(), z0.VnB(), 4);
__ Pfalse(p1.VnB());
__ Zip1(p1.VnB(), p0.VnB(), p1.VnB());
// Load and broadcast using scalar offsets.
__ Mov(x1, -42);
__ Ld1rqb(z0.VnB(), p1.Zeroing(), SVEMemOperand(x0, x1));
__ Add(x2, x0, 1);
__ Mov(x1, -21);
__ Punpklo(p2.VnH(), p1.VnB());
__ Ld1rqh(z1.VnH(), p2.Zeroing(), SVEMemOperand(x2, x1, LSL, 1));
__ Add(x2, x2, 1);
__ Mov(x1, -10);
__ Punpklo(p3.VnH(), p2.VnB());
__ Ld1rqw(z2.VnS(), p3.Zeroing(), SVEMemOperand(x2, x1, LSL, 2));
__ Add(x2, x2, 1);
__ Mov(x1, 5);
__ Punpklo(p4.VnH(), p3.VnB());
__ Ld1rqd(z3.VnD(), p4.Zeroing(), SVEMemOperand(x2, x1, LSL, 3));
// Check that all segments match by rotating the vector by one segment,
// eoring, and orring across the vector.
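// If every segment holds the same value, rotating by one segment maps each
// segment onto an identical one, so the Eor result is all-zero and the Orv
// reduction yields zero.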
__ Mov(z4, z0);
__ Ext(z4.VnB(), z4.VnB(), z4.VnB(), 16);
__ Eor(z4.VnB(), z4.VnB(), z0.VnB());
__ Orv(b4, p0, z4.VnB());
__ Mov(z5, z1);
__ Ext(z5.VnB(), z5.VnB(), z5.VnB(), 16);
__ Eor(z5.VnB(), z5.VnB(), z1.VnB());
__ Orv(b5, p0, z5.VnB());
__ Orr(z4, z4, z5);
__ Mov(z5, z2);
__ Ext(z5.VnB(), z5.VnB(), z5.VnB(), 16);
__ Eor(z5.VnB(), z5.VnB(), z2.VnB());
__ Orv(b5, p0, z5.VnB());
__ Orr(z4, z4, z5);
__ Mov(z5, z3);
__ Ext(z5.VnB(), z5.VnB(), z5.VnB(), 16);
__ Eor(z5.VnB(), z5.VnB(), z3.VnB());
__ Orv(b5, p0, z5.VnB());
__ Orr(z4, z4, z5);
// Load and broadcast the same values, using immediate offsets.
__ Add(x1, x0, 6);
__ Ld1rqb(z5.VnB(), p1.Zeroing(), SVEMemOperand(x1, -48));
__ Add(x1, x0, -9);
__ Ld1rqh(z6.VnH(), p2.Zeroing(), SVEMemOperand(x1, -32));
__ Add(x1, x0, -70);
__ Ld1rqw(z7.VnS(), p3.Zeroing(), SVEMemOperand(x1, 32));
__ Add(x1, x0, 27);
__ Ld1rqd(z8.VnD(), p4.Zeroing(), SVEMemOperand(x1, 16));
END();
if (CAN_RUN()) {
RUN();
uint64_t expected_z0[] = {0x0000000000000000, 0x006c006a00680066};
uint64_t expected_z1[] = {0x000074730000706f, 0x00006c6b00006867};
uint64_t expected_z2[] = {0x0000000075747372, 0x000000006d6c6b6a};
uint64_t expected_z3[] = {0x0000000000000000, 0xc2c1c0bfbebdbcbb};
uint64_t expected_z4[] = {0, 0};
ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
ASSERT_EQUAL_SVE(z0, z5);
ASSERT_EQUAL_SVE(z1, z6);
ASSERT_EQUAL_SVE(z2, z7);
ASSERT_EQUAL_SVE(z3, z8);
}
delete[] data;
}
TEST_SVE(sve_st1_vec_imm) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kNEON, CPUFeatures::kSVE);
START();
// TODO: Use mmap() to request a buffer in the low 4GB, which allows testing
// 32-bit address vectors.
int data_size = kZRegMaxSizeInBytes * 16;
uint8_t* data = new uint8_t[data_size];
// Set the base to 16 bytes from the end of the buffer so we can use negative
// indices.
__ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size - 16]));
__ Ptrue(p0.VnB());
// Store a vector of index values in reverse order, using
// vector-plus-immediate addressing to begin at byte 15, then storing to
// bytes 14, 13, etc.
__ Index(z1.VnD(), x0, -1);
__ Index(z2.VnD(), 0, 1);
// Iterate in order to store at least 16 bytes. The number of iterations
// depends on VL; e.g. VL128 iterates eight times, storing bytes 15 and 14 on
// the first iteration, 13 and 12 on the next, and so on.
uint64_t dlanes = config->sve_vl_in_bytes() / kDRegSizeInBytes;
for (int i = 15; i >= 0; i -= dlanes * kBRegSizeInBytes) {
__ St1b(z2.VnD(), p0, SVEMemOperand(z1.VnD(), i));
__ Incd(z2.VnD());
}
// Reload the stored data, and build a reference for comparison. The reference
// is truncated to a Q register, as only the least-significant 128 bits are
// checked.
__ Ldr(q4, MemOperand(x0));
__ Index(z5.VnB(), 15, -1);
__ Mov(q5, q5);
// Repeat for wider elements.
__ Index(z1.VnD(), x0, -2); // Stepping by -2 for H-sized elements.
__ Index(z2.VnD(), 0, 1);
for (int i = 14; i >= 0; i -= dlanes * kHRegSizeInBytes) {
__ St1h(z2.VnD(), p0, SVEMemOperand(z1.VnD(), i));
__ Incd(z2.VnD());
}
__ Ldr(q6, MemOperand(x0));
__ Index(z7.VnH(), 7, -1);
__ Mov(q7, q7);
__ Index(z1.VnD(), x0, -4); // Stepping by -4 for S-sized elements.
__ Index(z2.VnD(), 0, 1);
for (int i = 12; i >= 0; i -= dlanes * kSRegSizeInBytes) {
__ St1w(z2.VnD(), p0, SVEMemOperand(z1.VnD(), i));
__ Incd(z2.VnD());
}
__ Ldr(q8, MemOperand(x0));
__ Index(z9.VnS(), 3, -1);
__ Mov(q9, q9);
__ Index(z1.VnD(), x0, -8); // Stepping by -8 for D-sized elements.
__ Index(z2.VnD(), 0, 1);
for (int i = 8; i >= 0; i -= dlanes * kDRegSizeInBytes) {
__ St1d(z2.VnD(), p0, SVEMemOperand(z1.VnD(), i));
__ Incd(z2.VnD());
}
__ Ldr(q10, MemOperand(x0));
__ Index(z11.VnD(), 1, -1);
__ Mov(q11, q11);
// Test predication by storing even halfwords to memory (using predication)
// at byte-separated addresses. The result should be the same as storing
// even halfwords contiguously to memory.
__ Pfalse(p1.VnB());
__ Zip1(p1.VnD(), p0.VnD(), p1.VnD());
__ Mov(x0, reinterpret_cast<uintptr_t>(data));
__ Index(z1.VnD(), x0, 1);
__ Index(z2.VnD(), 0x1000, 1);
for (int i = 0; i < 16; i += dlanes) {
__ St1h(z2.VnD(), p1, SVEMemOperand(z1.VnD(), i));
__ Incd(z2.VnD());
}
__ Ldr(q2, MemOperand(x0));
__ Index(z3.VnH(), 0x1000, 2);
__ Mov(q3, q3);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(z3, z2);
ASSERT_EQUAL_SVE(z5, z4);
ASSERT_EQUAL_SVE(z7, z6);
ASSERT_EQUAL_SVE(z9, z8);
ASSERT_EQUAL_SVE(z11, z10);
}
delete[] data;
}
template <typename T>
static void sve_st1_scalar_plus_vector_helper(Test* config,
int esize_in_bits,
T mod,
bool is_scaled) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int vl = config->sve_vl_in_bytes();
int data_size = vl * 160;
uint8_t* data = new uint8_t[data_size];
memset(data, 0, data_size);
int vl_per_esize = vl / (esize_in_bits / kBitsPerByte);
ZRegister zn_b = z0.WithLaneSize(esize_in_bits);
ZRegister zn_h = z1.WithLaneSize(esize_in_bits);
ZRegister zn_s = z2.WithLaneSize(esize_in_bits);
ZRegister zn_d = z3.WithLaneSize(esize_in_bits);
ZRegister zn_ld_b = z10.WithLaneSize(esize_in_bits);
ZRegister zn_ld_h = z11.WithLaneSize(esize_in_bits);
ZRegister zn_ld_s = z12.WithLaneSize(esize_in_bits);
ZRegister zn_ld_d = z13.WithLaneSize(esize_in_bits);
ZRegister offsets = z31.WithLaneSize(esize_in_bits);
// Set the base half-way through the buffer so we can use negative indices.
__ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
__ Ptrue(p6.WithLaneSize(esize_in_bits));
__ Pfalse(p7.WithLaneSize(esize_in_bits));
__ Zip1(p0.WithLaneSize(esize_in_bits),
p6.WithLaneSize(esize_in_bits),
p7.WithLaneSize(esize_in_bits));
__ Zip1(p1.WithLaneSize(esize_in_bits),
p7.WithLaneSize(esize_in_bits),
p6.WithLaneSize(esize_in_bits));
// `st1b` doesn't have the scaled-offset forms.
if (!is_scaled) {
// Step the index by 2 to simulate a scatter memory access.
__ Index(offsets, 1, 2);
__ St1b(offsets, p0, SVEMemOperand(x0, offsets, mod));
__ Ld1b(zn_ld_b, p0.Zeroing(), SVEMemOperand(x0, offsets, mod));
__ Dup(zn_b, 0);
__ Mov(zn_b, p0.Merging(), offsets);
}
// Store the values to an isolated range that does not overlap the other
// stores.
int scale = is_scaled ? 1 : 0;
__ Add(x1, x0, vl_per_esize * 4);
__ Index(offsets, 6, 4);
__ St1h(offsets, p0, SVEMemOperand(x1, offsets, mod, scale));
__ Ld1h(zn_ld_h, p0.Zeroing(), SVEMemOperand(x1, offsets, mod, scale));
__ Dup(zn_h, 0);
__ Mov(zn_h, p0.Merging(), offsets);
scale = is_scaled ? 2 : 0;
__ Add(x2, x0, UINT64_MAX + (vl_per_esize * -8) + 1);
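// `UINT64_MAX + n + 1` is congruent to `n` modulo 2^64, so with
// n == vl_per_esize * -8 this computes x2 = x0 - (vl_per_esize * 8). The
// same wrap-around trick is used for x3 below.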
__ Index(offsets, 64, 8);
if ((std::is_same<T, vixl::aarch64::Extend>::value) &&
(static_cast<int>(mod) == SXTW)) {
// Testing negative offsets.
__ Neg(offsets, p6.Merging(), offsets);
}
__ St1w(offsets, p1, SVEMemOperand(x2, offsets, mod, scale));
__ Ld1w(zn_ld_s, p1.Zeroing(), SVEMemOperand(x2, offsets, mod, scale));
__ Dup(zn_s, 0);
__ Mov(zn_s, p1.Merging(), offsets);
if (esize_in_bits == kDRegSize) {
// Test st1w by comparing each 32-bit value loaded with the corresponding
// 32-bit value stored.
__ Lsl(zn_s, zn_s, kSRegSize);
__ Lsr(zn_s, zn_s, kSRegSize);
}
// `st1d` doesn't have the S-sized lane forms.
if (esize_in_bits == kDRegSize) {
scale = is_scaled ? 3 : 0;
__ Add(x3, x0, UINT64_MAX + (vl_per_esize * -16) + 1);
__ Index(offsets, 128, 16);
if ((std::is_same<T, vixl::aarch64::Extend>::value) &&
(static_cast<int>(mod) == SXTW)) {
__ Neg(offsets, p6.Merging(), offsets);
}
__ St1d(offsets, p1, SVEMemOperand(x3, offsets, mod, scale));
__ Ld1d(zn_ld_d, p1.Zeroing(), SVEMemOperand(x3, offsets, mod, scale));
__ Dup(zn_d, 0);
__ Mov(zn_d, p1.Merging(), offsets);
}
END();
if (CAN_RUN()) {
RUN();
if (!is_scaled) {
ASSERT_EQUAL_SVE(zn_ld_b, zn_b);
}
ASSERT_EQUAL_SVE(zn_ld_h, zn_h);
ASSERT_EQUAL_SVE(zn_ld_s, zn_s);
if (esize_in_bits == kDRegSize) {
ASSERT_EQUAL_SVE(zn_ld_d, zn_d);
}
}
delete[] data;
}
TEST_SVE(sve_st1_sca_vec_32_unpacked_unscaled) {
sve_st1_scalar_plus_vector_helper(config, kDRegSize, UXTW, false);
sve_st1_scalar_plus_vector_helper(config, kDRegSize, SXTW, false);
}
TEST_SVE(sve_st1_sca_vec_32_unpacked_scaled) {
sve_st1_scalar_plus_vector_helper(config, kDRegSize, UXTW, true);
sve_st1_scalar_plus_vector_helper(config, kDRegSize, SXTW, true);
}
TEST_SVE(sve_st1_sca_vec_32_unscaled) {
sve_st1_scalar_plus_vector_helper(config, kSRegSize, UXTW, false);
sve_st1_scalar_plus_vector_helper(config, kSRegSize, SXTW, false);
}
TEST_SVE(sve_st1_sca_vec_32_scaled) {
sve_st1_scalar_plus_vector_helper(config, kSRegSize, UXTW, true);
sve_st1_scalar_plus_vector_helper(config, kSRegSize, SXTW, true);
}
TEST_SVE(sve_st1_sca_vec_64_scaled) {
sve_st1_scalar_plus_vector_helper(config, kDRegSize, LSL, true);
}
TEST_SVE(sve_st1_sca_vec_64_unscaled) {
sve_st1_scalar_plus_vector_helper(config, kDRegSize, NO_SHIFT, false);
}
typedef void (MacroAssembler::*IntWideImmFn)(const ZRegister& zd,
const ZRegister& zn,
const IntegerOperand imm);
template <typename F, typename Td, typename Tn>
static void IntWideImmHelper(Test* config,
F macro,
unsigned lane_size_in_bits,
const Tn& zn_inputs,
IntegerOperand imm,
const Td& zd_expected) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
ZRegister zd1 = z0.WithLaneSize(lane_size_in_bits);
InsrHelper(&masm, zd1, zn_inputs);
// Also test with a different zn, to test the movprfx case.
ZRegister zn = z1.WithLaneSize(lane_size_in_bits);
InsrHelper(&masm, zn, zn_inputs);
ZRegister zd2 = z2.WithLaneSize(lane_size_in_bits);
ZRegister zn_copy = z3.WithSameLaneSizeAs(zn);
// Make a copy so we can check that constructive operations preserve zn.
__ Mov(zn_copy, zn);
{
UseScratchRegisterScope temps(&masm);
// The MacroAssembler needs a P scratch register for some of these macros,
// and it doesn't have one by default.
temps.Include(p3);
(masm.*macro)(zd1, zd1, imm);
(masm.*macro)(zd2, zn, imm);
}
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(zd_expected, zd1);
// Check that the result of the constructive form (which may use movprfx) is
// the same as that of the destructive form.
ASSERT_EQUAL_SVE(zd_expected, zd2);
ASSERT_EQUAL_SVE(zn_copy, zn);
}
}
TEST_SVE(sve_int_wide_imm_unpredicated_smax) {
int in_b[] = {0, -128, 127, -127, 126, 1, -1, 55};
int in_h[] = {0, -128, 127, INT16_MIN, INT16_MAX, 1, -1, 5555};
int in_s[] = {0, -128, 127, INT32_MIN, INT32_MAX, 1, -1, 555555};
int64_t in_d[] = {1, 10, 10000, 1000000};
IntWideImmFn fn = &MacroAssembler::Smax;
int exp_b_1[] = {0, -1, 127, -1, 126, 1, -1, 55};
int exp_h_1[] = {127, 127, 127, 127, INT16_MAX, 127, 127, 5555};
int exp_s_1[] = {0, -128, 127, -128, INT32_MAX, 1, -1, 555555};
int64_t exp_d_1[] = {99, 99, 10000, 1000000};
IntWideImmHelper(config, fn, kBRegSize, in_b, -1, exp_b_1);
IntWideImmHelper(config, fn, kHRegSize, in_h, 127, exp_h_1);
IntWideImmHelper(config, fn, kSRegSize, in_s, -128, exp_s_1);
IntWideImmHelper(config, fn, kDRegSize, in_d, 99, exp_d_1);
int exp_h_2[] = {0, -128, 127, -255, INT16_MAX, 1, -1, 5555};
int exp_s_2[] = {2048, 2048, 2048, 2048, INT32_MAX, 2048, 2048, 555555};
int64_t exp_d_2[] = {INT16_MAX, INT16_MAX, INT16_MAX, 1000000};
// The immediate is in the range [-128, 127], but the macro is able to
// synthesise unencodable immediates.
// B-sized lanes cannot take an immediate out of the range [-128, 127].
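// For example, `Smax` with #-255 on H-sized lanes has no direct encoding; the
// MacroAssembler is expected to materialise such an immediate in a scratch
// register and use a register form instead (hence the scratch P register made
// available in `IntWideImmHelper`).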
IntWideImmHelper(config, fn, kHRegSize, in_h, -255, exp_h_2);
IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2);
IntWideImmHelper(config, fn, kDRegSize, in_d, INT16_MAX, exp_d_2);
}
TEST_SVE(sve_int_wide_imm_unpredicated_smin) {
int in_b[] = {0, -128, 127, -127, 126, 1, -1, 55};
int in_h[] = {0, -128, 127, INT16_MIN, INT16_MAX, 1, -1, 5555};
int in_s[] = {0, -128, 127, INT32_MIN, INT32_MAX, 1, -1, 555555};
int64_t in_d[] = {1, 10, 10000, 1000000};
IntWideImmFn fn = &MacroAssembler::Smin;
int exp_b_1[] = {-1, -128, -1, -127, -1, -1, -1, -1};
int exp_h_1[] = {0, -128, 127, INT16_MIN, 127, 1, -1, 127};
int exp_s_1[] = {-128, -128, -128, INT32_MIN, -128, -128, -128, -128};
int64_t exp_d_1[] = {1, 10, 99, 99};
IntWideImmHelper(config, fn, kBRegSize, in_b, -1, exp_b_1);
IntWideImmHelper(config, fn, kHRegSize, in_h, 127, exp_h_1);
IntWideImmHelper(config, fn, kSRegSize, in_s, -128, exp_s_1);
IntWideImmHelper(config, fn, kDRegSize, in_d, 99, exp_d_1);
int exp_h_2[] = {-255, -255, -255, INT16_MIN, -255, -255, -255, -255};
int exp_s_2[] = {0, -128, 127, INT32_MIN, 2048, 1, -1, 2048};
int64_t exp_d_2[] = {1, 10, 10000, INT16_MAX};
// The immediate is in the range [-128, 127], but the macro is able to
// synthesise unencodable immediates.
// B-sized lanes cannot take an immediate out of the range [-128, 127].
IntWideImmHelper(config, fn, kHRegSize, in_h, -255, exp_h_2);
IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2);
IntWideImmHelper(config, fn, kDRegSize, in_d, INT16_MAX, exp_d_2);
}
TEST_SVE(sve_int_wide_imm_unpredicated_umax) {
int in_b[] = {0, 255, 127, 0x80, 1, 55};
int in_h[] = {0, 255, 127, INT16_MAX, 1, 5555};
int in_s[] = {0, 0xff, 0x7f, INT32_MAX, 1, 555555};
int64_t in_d[] = {1, 10, 10000, 1000000};
IntWideImmFn fn = &MacroAssembler::Umax;
int exp_b_1[] = {17, 255, 127, 0x80, 17, 55};
int exp_h_1[] = {127, 255, 127, INT16_MAX, 127, 5555};
int exp_s_1[] = {255, 255, 255, INT32_MAX, 255, 555555};
int64_t exp_d_1[] = {99, 99, 10000, 1000000};
IntWideImmHelper(config, fn, kBRegSize, in_b, 17, exp_b_1);
IntWideImmHelper(config, fn, kHRegSize, in_h, 0x7f, exp_h_1);
IntWideImmHelper(config, fn, kSRegSize, in_s, 0xff, exp_s_1);
IntWideImmHelper(config, fn, kDRegSize, in_d, 99, exp_d_1);
int exp_h_2[] = {511, 511, 511, INT16_MAX, 511, 5555};
int exp_s_2[] = {2048, 2048, 2048, INT32_MAX, 2048, 555555};
int64_t exp_d_2[] = {INT16_MAX, INT16_MAX, INT16_MAX, 1000000};
// The immediate is in the range [0, 255], but the macro is able to
// synthesise unencodable immediates.
// B-sized lanes cannot take an immediate out of the range [0, 255].
IntWideImmHelper(config, fn, kHRegSize, in_h, 511, exp_h_2);
IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2);
IntWideImmHelper(config, fn, kDRegSize, in_d, INT16_MAX, exp_d_2);
}
TEST_SVE(sve_int_wide_imm_unpredicated_umin) {
int in_b[] = {0, 255, 127, 0x80, 1, 55};
int in_h[] = {0, 255, 127, INT16_MAX, 1, 5555};
int in_s[] = {0, 0xff, 0x7f, INT32_MAX, 1, 555555};
int64_t in_d[] = {1, 10, 10000, 1000000};
IntWideImmFn fn = &MacroAssembler::Umin;
int exp_b_1[] = {0, 17, 17, 17, 1, 17};
int exp_h_1[] = {0, 127, 127, 127, 1, 127};
int exp_s_1[] = {0, 255, 127, 255, 1, 255};
int64_t exp_d_1[] = {1, 10, 99, 99};
IntWideImmHelper(config, fn, kBRegSize, in_b, 17, exp_b_1);
IntWideImmHelper(config, fn, kHRegSize, in_h, 0x7f, exp_h_1);
IntWideImmHelper(config, fn, kSRegSize, in_s, 255, exp_s_1);
IntWideImmHelper(config, fn, kDRegSize, in_d, 99, exp_d_1);
int exp_h_2[] = {0, 255, 127, 511, 1, 511};
int exp_s_2[] = {0, 255, 127, 2048, 1, 2048};
int64_t exp_d_2[] = {1, 10, 10000, INT16_MAX};
// The immediate is in the range [0, 255], but the macro is able to
// synthesise unencodable immediates.
// B-sized lanes cannot take an immediate out of the range [0, 255].
IntWideImmHelper(config, fn, kHRegSize, in_h, 511, exp_h_2);
IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2);
IntWideImmHelper(config, fn, kDRegSize, in_d, INT16_MAX, exp_d_2);
}
TEST_SVE(sve_int_wide_imm_unpredicated_mul) {
int in_b[] = {11, -1, 7, -3};
int in_h[] = {111, -1, 17, -123};
int in_s[] = {11111, -1, 117, -12345};
int64_t in_d[] = {0x7fffffff, 0x80000000};
IntWideImmFn fn = &MacroAssembler::Mul;
int exp_b_1[] = {66, -6, 42, -18};
int exp_h_1[] = {-14208, 128, -2176, 15744};
int exp_s_1[] = {11111 * 127, -127, 117 * 127, -12345 * 127};
int64_t exp_d_1[] = {0xfffffffe, 0x100000000};
IntWideImmHelper(config, fn, kBRegSize, in_b, 6, exp_b_1);
IntWideImmHelper(config, fn, kHRegSize, in_h, -128, exp_h_1);
IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
IntWideImmHelper(config, fn, kDRegSize, in_d, 2, exp_d_1);
int exp_h_2[] = {-28305, 255, -4335, 31365};
int exp_s_2[] = {22755328, -2048, 239616, -25282560};
int64_t exp_d_2[] = {0x00000063ffffff38, 0x0000006400000000};
// The immediate is in the range [-128, 127], but the macro is able to
// synthesise unencodable immediates.
// B-sized lanes cannot take an immediate out of the range [-128, 127].
IntWideImmHelper(config, fn, kHRegSize, in_h, -255, exp_h_2);
IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2);
IntWideImmHelper(config, fn, kDRegSize, in_d, 200, exp_d_2);
// Integer overflow on multiplication.
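// For example, 11 * 0x7f = 1397 = 0x575, which truncates to 0x75 in a B lane,
// and -1 * 0x7f = -127 = 0x81.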
unsigned exp_b_3[] = {0x75, 0x81, 0x79, 0x83};
IntWideImmHelper(config, fn, kBRegSize, in_b, 0x7f, exp_b_3);
}
TEST_SVE(sve_int_wide_imm_unpredicated_add) {
unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};
IntWideImmFn fn = &MacroAssembler::Add;
unsigned exp_b_1[] = {0x02, 0x00, 0x91, 0x80};
unsigned exp_h_1[] = {0x8191, 0x7f8f, 0x1020, 0xaaba};
unsigned exp_s_1[] = {0x80018200, 0x7fff7ffe, 0xaaaaab29, 0xf000f16f};
uint64_t exp_d_1[] = {0x8000000180018280, 0x7fffffff7fff807e};
// Encodable with `add` (shift 0).
IntWideImmHelper(config, fn, kBRegSize, in_b, 0x81, exp_b_1);
IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff, exp_d_1);
unsigned exp_h_2[] = {0x9181, 0x8f7f, 0x2010, 0xbaaa};
unsigned exp_s_2[] = {0x80020081, 0x7ffffe7f, 0xaaab29aa, 0xf0016ff0};
uint64_t exp_d_2[] = {0x8000000180028081, 0x7fffffff80007e7f};
// Encodable with `add` (shift 8).
// B-sized lanes cannot take a shift of 8.
IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
unsigned exp_s_3[] = {0x80808181, 0x807e7f7f, 0xab29aaaa, 0xf07ff0f0};
// The macro is able to synthesise unencodable immediates.
IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 16, exp_s_3);
unsigned exp_b_4[] = {0x61, 0x5f, 0xf0, 0xdf};
unsigned exp_h_4[] = {0x6181, 0x5f7f, 0xf010, 0x8aaa};
unsigned exp_s_4[] = {0x00018181, 0xffff7f7f, 0x2aaaaaaa, 0x7000f0f0};
uint64_t exp_d_4[] = {0x8000000180018180, 0x7fffffff7fff7f7e};
// Negative immediates use `sub`.
IntWideImmHelper(config, fn, kBRegSize, in_b, -0x20, exp_b_4);
IntWideImmHelper(config, fn, kHRegSize, in_h, -0x2000, exp_h_4);
IntWideImmHelper(config, fn, kSRegSize, in_s, INT32_MIN, exp_s_4);
IntWideImmHelper(config, fn, kDRegSize, in_d, -1, exp_d_4);
}
TEST_SVE(sve_int_wide_imm_unpredicated_sqadd) {
unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};
IntWideImmFn fn = &MacroAssembler::Sqadd;
unsigned exp_b_1[] = {0x02, 0x7f, 0x7f, 0x7f};
unsigned exp_h_1[] = {0x8191, 0x7f8f, 0x1020, 0xaaba};
unsigned exp_s_1[] = {0x80018200, 0x7fff7ffe, 0xaaaaab29, 0xf000f16f};
uint64_t exp_d_1[] = {0x8000000180018280, 0x7fffffff7fff807e};
// Encodable with `sqadd` (shift 0).
// Note that encodable immediates are unsigned, even for signed saturation.
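// For example, with an immediate of 129: 0x7f (+127) + 129 saturates to 0x7f
// (INT8_MAX), whereas 0x81 (-127) + 129 = +2 = 0x02.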
IntWideImmHelper(config, fn, kBRegSize, in_b, 129, exp_b_1);
IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
IntWideImmHelper(config, fn, kDRegSize, in_d, 255, exp_d_1);
unsigned exp_h_2[] = {0x9181, 0x7fff, 0x2010, 0xbaaa};
unsigned exp_s_2[] = {0x80020081, 0x7ffffe7f, 0xaaab29aa, 0xf0016ff0};
uint64_t exp_d_2[] = {0x8000000180028081, 0x7fffffff80007e7f};
// Encodable with `sqadd` (shift 8).
// B-sized lanes cannot take a shift of 8.
IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
}
TEST_SVE(sve_int_wide_imm_unpredicated_uqadd) {
unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};
IntWideImmFn fn = &MacroAssembler::Uqadd;
unsigned exp_b_1[] = {0xff, 0xff, 0x91, 0xff};
unsigned exp_h_1[] = {0x8191, 0x7f8f, 0x1020, 0xaaba};
unsigned exp_s_1[] = {0x80018200, 0x7fff7ffe, 0xaaaaab29, 0xf000f16f};
uint64_t exp_d_1[] = {0x8000000180018280, 0x7fffffff7fff807e};
// Encodable with `uqadd` (shift 0).
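// For example, 0x81 + 0x81 = 0x102 saturates to 0xff (UINT8_MAX), whereas
// 0x10 + 0x81 = 0x91 does not saturate.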
IntWideImmHelper(config, fn, kBRegSize, in_b, 0x81, exp_b_1);
IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff, exp_d_1);
unsigned exp_h_2[] = {0x9181, 0x8f7f, 0x2010, 0xbaaa};
unsigned exp_s_2[] = {0x80020081, 0x7ffffe7f, 0xaaab29aa, 0xf0016ff0};
uint64_t exp_d_2[] = {0x8000000180028081, 0x7fffffff80007e7f};
// Encodable with `uqadd` (shift 8).
// B-sized lanes cannot take a shift of 8.
IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
}
TEST_SVE(sve_int_wide_imm_unpredicated_sub) {
unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};
IntWideImmFn fn = &MacroAssembler::Sub;
unsigned exp_b_1[] = {0x00, 0xfe, 0x8f, 0x7e};
unsigned exp_h_1[] = {0x8171, 0x7f6f, 0x1000, 0xaa9a};
unsigned exp_s_1[] = {0x80018102, 0x7fff7f00, 0xaaaaaa2b, 0xf000f071};
uint64_t exp_d_1[] = {0x8000000180018082, 0x7fffffff7fff7e80};
// Encodable with `sub` (shift 0).
IntWideImmHelper(config, fn, kBRegSize, in_b, 0x81, exp_b_1);
IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff, exp_d_1);
unsigned exp_h_2[] = {0x7181, 0x6f7f, 0x0010, 0x9aaa};
unsigned exp_s_2[] = {0x80010281, 0x7fff007f, 0xaaaa2baa, 0xf00071f0};
uint64_t exp_d_2[] = {0x8000000180008281, 0x7fffffff7ffe807f};
// Encodable with `sub` (shift 8).
// B-sized lanes cannot take a shift of 8.
IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
unsigned exp_s_3[] = {0x7f828181, 0x7f807f7f, 0xaa2baaaa, 0xef81f0f0};
// The macro is able to synthesise unencodable immediates.
IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 16, exp_s_3);
unsigned exp_b_4[] = {0xa1, 0x9f, 0x30, 0x1f};
unsigned exp_h_4[] = {0xa181, 0x9f7f, 0x3010, 0xcaaa};
unsigned exp_s_4[] = {0x00018181, 0xffff7f7f, 0x2aaaaaaa, 0x7000f0f0};
uint64_t exp_d_4[] = {0x8000000180018182, 0x7fffffff7fff7f80};
// Negative immediates use `add`.
IntWideImmHelper(config, fn, kBRegSize, in_b, -0x20, exp_b_4);
IntWideImmHelper(config, fn, kHRegSize, in_h, -0x2000, exp_h_4);
IntWideImmHelper(config, fn, kSRegSize, in_s, INT32_MIN, exp_s_4);
IntWideImmHelper(config, fn, kDRegSize, in_d, -1, exp_d_4);
}
TEST_SVE(sve_int_wide_imm_unpredicated_sqsub) {
unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};
IntWideImmFn fn = &MacroAssembler::Sqsub;
unsigned exp_b_1[] = {0x80, 0xfe, 0x8f, 0x80};
unsigned exp_h_1[] = {0x8171, 0x7f6f, 0x1000, 0xaa9a};
unsigned exp_s_1[] = {0x80018102, 0x7fff7f00, 0xaaaaaa2b, 0xf000f071};
uint64_t exp_d_1[] = {0x8000000180018082, 0x7fffffff7fff7e80};
// Encodable with `sqsub` (shift 0).
// Note that encodable immediates are unsigned, even for signed saturation.
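// For example, with an immediate of 129: 0x81 (-127) - 129 = -256 saturates
// to 0x80 (INT8_MIN), whereas 0x7f (+127) - 129 = -2 = 0xfe.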
IntWideImmHelper(config, fn, kBRegSize, in_b, 129, exp_b_1);
IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
IntWideImmHelper(config, fn, kDRegSize, in_d, 255, exp_d_1);
unsigned exp_h_2[] = {0x8000, 0x6f7f, 0x0010, 0x9aaa};
unsigned exp_s_2[] = {0x80010281, 0x7fff007f, 0xaaaa2baa, 0xf00071f0};
uint64_t exp_d_2[] = {0x8000000180008281, 0x7fffffff7ffe807f};
// Encodable with `sqsub` (shift 8).
// B-sized lanes cannot take a shift of 8.
IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
}
TEST_SVE(sve_int_wide_imm_unpredicated_uqsub) {
unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};
IntWideImmFn fn = &MacroAssembler::Uqsub;
unsigned exp_b_1[] = {0x00, 0x00, 0x00, 0x7e};
unsigned exp_h_1[] = {0x8171, 0x7f6f, 0x1000, 0xaa9a};
unsigned exp_s_1[] = {0x80018102, 0x7fff7f00, 0xaaaaaa2b, 0xf000f071};
uint64_t exp_d_1[] = {0x8000000180018082, 0x7fffffff7fff7e80};
// Encodable with `uqsub` (shift 0).
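// For example, 0x10 - 0x81 underflows and saturates to 0x00, whereas
// 0xff - 0x81 = 0x7e does not saturate.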
IntWideImmHelper(config, fn, kBRegSize, in_b, 0x81, exp_b_1);
IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff, exp_d_1);
unsigned exp_h_2[] = {0x7181, 0x6f7f, 0x0010, 0x9aaa};
unsigned exp_s_2[] = {0x80010281, 0x7fff007f, 0xaaaa2baa, 0xf00071f0};
uint64_t exp_d_2[] = {0x8000000180008281, 0x7fffffff7ffe807f};
// Encodable with `uqsub` (shift 8).
// B-sized lanes cannot take a shift of 8.
IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
}
TEST_SVE(sve_int_wide_imm_unpredicated_subr) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// Encodable with `subr` (shift 0).
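// With the immediate as the first operand, Sub computes (imm - zn), mapping
// to `subr` where possible. For example, 100 - {1, 2, 3, ...} = {99, 98, 97, ...}.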
__ Index(z0.VnD(), 1, 1);
__ Sub(z0.VnD(), 100, z0.VnD());
__ Index(z1.VnS(), 0x7f, 1);
__ Sub(z1.VnS(), 0xf7, z1.VnS());
__ Index(z2.VnH(), 0xaaaa, 0x2222);
__ Sub(z2.VnH(), 0x80, z2.VnH());
__ Index(z3.VnB(), 133, 1);
__ Sub(z3.VnB(), 255, z3.VnB());
// Encodable with `subr` (shift 8).
__ Index(z4.VnD(), 256, -1);
__ Sub(z4.VnD(), 42 * 256, z4.VnD());
__ Index(z5.VnS(), 0x7878, 1);
__ Sub(z5.VnS(), 0x8000, z5.VnS());
__ Index(z6.VnH(), 0x30f0, -1);
__ Sub(z6.VnH(), 0x7f00, z6.VnH());
// B-sized lanes cannot take a shift of 8.
// Use a distinct destination register so that the macro selects movprfx.
__ Index(z31.VnD(), 256, 4001);
__ Sub(z7.VnD(), 42 * 256, z31.VnD());
// Outside the encodable immediate range of `subr`; the macro synthesises the
// immediate.
__ Index(z30.VnS(), 0x11223344, 1);
__ Sub(z8.VnS(), 0x88776655, z30.VnS());
END();
if (CAN_RUN()) {
RUN();
int expected_z0[] = {87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99};
ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
int expected_z1[] = {0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78};
ASSERT_EQUAL_SVE(expected_z1, z1.VnS());
int expected_z2[] = {0xab2c, 0xcd4e, 0xef70, 0x1192, 0x33b4, 0x55d6};
ASSERT_EQUAL_SVE(expected_z2, z2.VnH());
int expected_z3[] = {0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a};
ASSERT_EQUAL_SVE(expected_z3, z3.VnB());
int expected_z4[] = {10502, 10501, 10500, 10499, 10498, 10497, 10496};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
int expected_z5[] = {0x0783, 0x0784, 0x0785, 0x0786, 0x0787, 0x0788};
ASSERT_EQUAL_SVE(expected_z5, z5.VnS());
int expected_z6[] = {0x4e15, 0x4e14, 0x4e13, 0x4e12, 0x4e11, 0x4e10};
ASSERT_EQUAL_SVE(expected_z6, z6.VnH());
int expected_z7[] = {-13510, -9509, -5508, -1507, 2494, 6495, 10496};
ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
int expected_z8[] = {0x7755330e, 0x7755330f, 0x77553310, 0x77553311};
ASSERT_EQUAL_SVE(expected_z8, z8.VnS());
}
}
TEST_SVE(sve_int_wide_imm_unpredicated_fdup) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// Immediates which can be encoded in the instructions.
__ Fdup(z0.VnH(), RawbitsToFloat16(0xc500));
__ Fdup(z1.VnS(), Float16(2.0));
__ Fdup(z2.VnD(), Float16(3.875));
__ Fdup(z3.VnH(), 8.0f);
__ Fdup(z4.VnS(), -4.75f);
__ Fdup(z5.VnD(), 0.5f);
__ Fdup(z6.VnH(), 1.0);
__ Fdup(z7.VnS(), 2.125);
__ Fdup(z8.VnD(), -13.0);
// Immediates which cannot be encoded in the instructions.
__ Fdup(z10.VnH(), Float16(0.0));
__ Fdup(z11.VnH(), kFP16PositiveInfinity);
__ Fdup(z12.VnS(), 255.0f);
__ Fdup(z13.VnS(), kFP32NegativeInfinity);
__ Fdup(z14.VnD(), 12.3456);
__ Fdup(z15.VnD(), kFP64PositiveInfinity);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(0xc500, z0.VnH());
ASSERT_EQUAL_SVE(0x40000000, z1.VnS());
ASSERT_EQUAL_SVE(0x400f000000000000, z2.VnD());
ASSERT_EQUAL_SVE(0x4800, z3.VnH());
ASSERT_EQUAL_SVE(FloatToRawbits(-4.75f), z4.VnS());
ASSERT_EQUAL_SVE(DoubleToRawbits(0.5), z5.VnD());
ASSERT_EQUAL_SVE(0x3c00, z6.VnH());
ASSERT_EQUAL_SVE(FloatToRawbits(2.125f), z7.VnS());
ASSERT_EQUAL_SVE(DoubleToRawbits(-13.0), z8.VnD());
ASSERT_EQUAL_SVE(0x0000, z10.VnH());
ASSERT_EQUAL_SVE(Float16ToRawbits(kFP16PositiveInfinity), z11.VnH());
ASSERT_EQUAL_SVE(FloatToRawbits(255.0), z12.VnS());
ASSERT_EQUAL_SVE(FloatToRawbits(kFP32NegativeInfinity), z13.VnS());
ASSERT_EQUAL_SVE(DoubleToRawbits(12.3456), z14.VnD());
ASSERT_EQUAL_SVE(DoubleToRawbits(kFP64PositiveInfinity), z15.VnD());
}
}
TEST_SVE(sve_andv_eorv_orv) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t in[] = {0x8899aabbccddeeff, 0x7777555533331111, 0x123456789abcdef0};
InsrHelper(&masm, z31.VnD(), in);
// For simplicity, we re-use the same pg for various lane sizes.
// For D lanes: 1, 1, 0
// For S lanes: 1, 1, 1, 0, 0
// For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
Initialise(&masm, p0.VnB(), pg_in);
// Make a copy so we can check that constructive operations preserve zn.
__ Mov(z0, z31);
__ Andv(b0, p0, z0.VnB()); // destructive
__ Andv(h1, p0, z31.VnH());
__ Mov(z2, z31);
__ Andv(s2, p0, z2.VnS()); // destructive
__ Andv(d3, p0, z31.VnD());
__ Eorv(b4, p0, z31.VnB());
__ Mov(z5, z31);
__ Eorv(h5, p0, z5.VnH()); // destructive
__ Eorv(s6, p0, z31.VnS());
__ Mov(z7, z31);
__ Eorv(d7, p0, z7.VnD()); // destructive
__ Mov(z8, z31);
__ Orv(b8, p0, z8.VnB()); // destructive
__ Orv(h9, p0, z31.VnH());
__ Mov(z10, z31);
__ Orv(s10, p0, z10.VnS()); // destructive
__ Orv(d11, p0, z31.VnD());
END();
if (CAN_RUN()) {
RUN();
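// With a 16-byte vector, only the low lanes of the 20-entry pg pattern and
// the 24-byte input apply, giving the first set of expected values; longer
// vectors use the full pattern.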
if (static_cast<int>(ArrayLength(pg_in)) >= config->sve_vl_in_bytes()) {
ASSERT_EQUAL_64(0x10, d0);
ASSERT_EQUAL_64(0x1010, d1);
ASSERT_EQUAL_64(0x33331111, d2);
ASSERT_EQUAL_64(0x7777555533331111, d3);
ASSERT_EQUAL_64(0xbf, d4);
ASSERT_EQUAL_64(0xedcb, d5);
ASSERT_EQUAL_64(0x44444444, d6);
ASSERT_EQUAL_64(0x7777555533331111, d7);
ASSERT_EQUAL_64(0xff, d8);
ASSERT_EQUAL_64(0xffff, d9);
ASSERT_EQUAL_64(0x77775555, d10);
ASSERT_EQUAL_64(0x7777555533331111, d11);
} else {
ASSERT_EQUAL_64(0, d0);
ASSERT_EQUAL_64(0x0010, d1);
ASSERT_EQUAL_64(0x00110011, d2);
ASSERT_EQUAL_64(0x0011001100110011, d3);
ASSERT_EQUAL_64(0x62, d4);
ASSERT_EQUAL_64(0x0334, d5);
ASSERT_EQUAL_64(0x8899aabb, d6);
ASSERT_EQUAL_64(0xffeeffeeffeeffee, d7);
ASSERT_EQUAL_64(0xff, d8);
ASSERT_EQUAL_64(0xffff, d9);
ASSERT_EQUAL_64(0xffffffff, d10);
ASSERT_EQUAL_64(0xffffffffffffffff, d11);
}
// Check the upper lanes above the top of the V register are all clear.
for (int i = 1; i < core.GetSVELaneCount(kDRegSize); i++) {
ASSERT_EQUAL_SVE_LANE(0, z0.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z1.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z2.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z3.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z4.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z5.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z6.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z7.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z8.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z9.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z10.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z11.VnD(), i);
}
}
}
TEST_SVE(sve_saddv_uaddv) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t in[] = {0x8899aabbccddeeff, 0x8182838485868788, 0x0807060504030201};
InsrHelper(&masm, z31.VnD(), in);
// For simplicity, we re-use the same pg for various lane sizes.
// For D lanes: 1, 1, 0
// For S lanes: 1, 1, 1, 0, 0
// For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
Initialise(&masm, p0.VnB(), pg_in);
// Make a copy so we can check that constructive operations preserve zn.
__ Mov(z0, z31);
__ Saddv(b0, p0, z0.VnB()); // destructive
__ Saddv(h1, p0, z31.VnH());
__ Mov(z2, z31);
__ Saddv(s2, p0, z2.VnS()); // destructive
__ Uaddv(b4, p0, z31.VnB());
__ Mov(z5, z31);
__ Uaddv(h5, p0, z5.VnH()); // destructive
__ Uaddv(s6, p0, z31.VnS());
__ Mov(z7, z31);
__ Uaddv(d7, p0, z7.VnD()); // destructive
END();
if (CAN_RUN()) {
RUN();
if (static_cast<int>(ArrayLength(pg_in)) >= config->sve_vl_in_bytes()) {
// Saddv
ASSERT_EQUAL_64(0xfffffffffffffda9, d0);
ASSERT_EQUAL_64(0xfffffffffffe9495, d1);
ASSERT_EQUAL_64(0xffffffff07090b0c, d2);
// Uaddv
ASSERT_EQUAL_64(0x00000000000002a9, d4);
ASSERT_EQUAL_64(0x0000000000019495, d5);
ASSERT_EQUAL_64(0x0000000107090b0c, d6);
ASSERT_EQUAL_64(0x8182838485868788, d7);
} else {
// Saddv
ASSERT_EQUAL_64(0xfffffffffffffd62, d0);
ASSERT_EQUAL_64(0xfffffffffffe8394, d1);
ASSERT_EQUAL_64(0xfffffffed3e6fa0b, d2);
// Uaddv
ASSERT_EQUAL_64(0x0000000000000562, d4);
ASSERT_EQUAL_64(0x0000000000028394, d5);
ASSERT_EQUAL_64(0x00000001d3e6fa0b, d6);
ASSERT_EQUAL_64(0x0a1c2e4052647687, d7);
}
// Check the upper lanes above the top of the V register are all clear.
for (int i = 1; i < core.GetSVELaneCount(kDRegSize); i++) {
ASSERT_EQUAL_SVE_LANE(0, z0.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z1.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z2.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z4.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z5.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z6.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z7.VnD(), i);
}
}
}
TEST_SVE(sve_sminv_uminv) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t in[] = {0xfffa5555aaaaaaaa, 0x0011223344aafe80, 0x00112233aabbfc00};
InsrHelper(&masm, z31.VnD(), in);
// For simplicity, we re-use the same pg for various lane sizes.
// For D lanes: 1, 0, 1
// For S lanes: 1, 1, 0, 0, 1
// For H lanes: 1, 1, 0, 1, 1, 0, 0, 0, 1, 1
int pg_in[] = {1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1};
Initialise(&masm, p0.VnB(), pg_in);
// Make a copy so we can check that constructive operations preserve zn.
__ Mov(z0, z31);
__ Sminv(b0, p0, z0.VnB()); // destructive
__ Sminv(h1, p0, z31.VnH());
__ Mov(z2, z31);
__ Sminv(s2, p0, z2.VnS()); // destructive
__ Sminv(d3, p0, z31.VnD());
__ Uminv(b4, p0, z31.VnB());
__ Mov(z5, z31);
__ Uminv(h5, p0, z5.VnH()); // destructive
__ Uminv(s6, p0, z31.VnS());
__ Mov(z7, z31);
__ Uminv(d7, p0, z7.VnD()); // destructive
END();
if (CAN_RUN()) {
RUN();
if (static_cast<int>(ArrayLength(pg_in)) >= config->sve_vl_in_bytes()) {
// Sminv
ASSERT_EQUAL_64(0xaa, d0);
ASSERT_EQUAL_64(0xaabb, d1);
ASSERT_EQUAL_64(0xaabbfc00, d2);
ASSERT_EQUAL_64(0x00112233aabbfc00, d3); // The lane with the smaller value is inactive.
// Uminv
ASSERT_EQUAL_64(0, d4);
ASSERT_EQUAL_64(0x2233, d5);
ASSERT_EQUAL_64(0x112233, d6);
ASSERT_EQUAL_64(0x00112233aabbfc00, d7); // The lane with the smaller value is inactive.
} else {
// Sminv
ASSERT_EQUAL_64(0xaa, d0);
ASSERT_EQUAL_64(0xaaaa, d1);
ASSERT_EQUAL_64(0xaaaaaaaa, d2);
ASSERT_EQUAL_64(0xfffa5555aaaaaaaa, d3);
// Uminv
ASSERT_EQUAL_64(0, d4);
ASSERT_EQUAL_64(0x2233, d5);
ASSERT_EQUAL_64(0x112233, d6);
ASSERT_EQUAL_64(0x00112233aabbfc00, d7);
}
// Check the upper lanes above the top of the V register are all clear.
for (int i = 1; i < core.GetSVELaneCount(kDRegSize); i++) {
ASSERT_EQUAL_SVE_LANE(0, z0.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z1.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z2.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z3.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z4.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z5.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z6.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z7.VnD(), i);
}
}
}
TEST_SVE(sve_smaxv_umaxv) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t in[] = {0xfffa5555aaaaaaaa, 0x0011223344aafe80, 0x00112233aabbfc00};
InsrHelper(&masm, z31.VnD(), in);
// For simplicity, we re-use the same pg for various lane sizes.
// For D lanes: 1, 0, 1
// For S lanes: 1, 1, 0, 0, 1
// For H lanes: 1, 1, 0, 1, 1, 0, 0, 0, 1, 1
int pg_in[] = {1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1};
Initialise(&masm, p0.VnB(), pg_in);
// Make a copy so we can check that constructive operations preserve zn.
__ Mov(z0, z31);
__ Smaxv(b0, p0, z0.VnB()); // destructive
__ Smaxv(h1, p0, z31.VnH());
__ Mov(z2, z31);
__ Smaxv(s2, p0, z2.VnS()); // destructive
__ Smaxv(d3, p0, z31.VnD());
__ Umaxv(b4, p0, z31.VnB());
__ Mov(z5, z31);
__ Umaxv(h5, p0, z5.VnH()); // destructive
__ Umaxv(s6, p0, z31.VnS());
__ Mov(z7, z31);
__ Umaxv(d7, p0, z7.VnD()); // destructive
END();
if (CAN_RUN()) {
RUN();
if (static_cast<int>(ArrayLength(pg_in)) >= config->sve_vl_in_bytes()) {
// Smaxv
ASSERT_EQUAL_64(0x33, d0);
ASSERT_EQUAL_64(0x44aa, d1);
ASSERT_EQUAL_64(0x112233, d2);
ASSERT_EQUAL_64(0x112233aabbfc00, d3);
// Umaxv
ASSERT_EQUAL_64(0xfe, d4);
ASSERT_EQUAL_64(0xfc00, d5);
ASSERT_EQUAL_64(0xaabbfc00, d6);
ASSERT_EQUAL_64(0x112233aabbfc00, d7);
} else {
// Smaxv
ASSERT_EQUAL_64(0x33, d0);
ASSERT_EQUAL_64(0x44aa, d1);
ASSERT_EQUAL_64(0x112233, d2);
ASSERT_EQUAL_64(0x00112233aabbfc00, d3);
// Umaxv
ASSERT_EQUAL_64(0xfe, d4);
ASSERT_EQUAL_64(0xfc00, d5);
ASSERT_EQUAL_64(0xaabbfc00, d6);
ASSERT_EQUAL_64(0xfffa5555aaaaaaaa, d7);
}
// Check the upper lanes above the top of the V register are all clear.
for (int i = 1; i < core.GetSVELaneCount(kDRegSize); i++) {
ASSERT_EQUAL_SVE_LANE(0, z0.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z1.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z2.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z3.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z4.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z5.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z6.VnD(), i);
ASSERT_EQUAL_SVE_LANE(0, z7.VnD(), i);
}
}
}
template <typename T, size_t M, size_t N>
static void SdotUdotHelper(Test* config,
unsigned lane_size_in_bits,
const T (&zd_inputs)[M],
const T (&za_inputs)[M],
const T (&zn_inputs)[N],
const T (&zm_inputs)[N],
const T (&zd_expected)[M],
const T (&zdnm_expected)[M],
bool is_signed,
int index = -1) {
VIXL_STATIC_ASSERT(N == (M * 4));
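// Each destination lane accumulates the dot product of four consecutive
// source lanes, so the source arrays must be four times the length of the
// accumulator arrays.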
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
auto dot_fn = [&](const ZRegister& zd,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm,
bool is_signed_fn,
int index_fn) {
if (is_signed_fn) {
if (index_fn < 0) {
__ Sdot(zd, za, zn, zm);
} else {
__ Sdot(zd, za, zn, zm, index_fn);
}
} else {
if (index_fn < 0) {
__ Udot(zd, za, zn, zm);
} else {
__ Udot(zd, za, zn, zm, index_fn);
}
}
};
ZRegister zd = z0.WithLaneSize(lane_size_in_bits);
ZRegister za = z1.WithLaneSize(lane_size_in_bits);
ZRegister zn = z2.WithLaneSize(lane_size_in_bits / 4);
ZRegister zm = z3.WithLaneSize(lane_size_in_bits / 4);
InsrHelper(&masm, zd, zd_inputs);
InsrHelper(&masm, za, za_inputs);
InsrHelper(&masm, zn, zn_inputs);
InsrHelper(&masm, zm, zm_inputs);
// The Dot macro handles arbitrarily-aliased registers in the argument list.
ZRegister dm_result = z4.WithLaneSize(lane_size_in_bits);
ZRegister dnm_result = z5.WithLaneSize(lane_size_in_bits);
ZRegister da_result = z6.WithLaneSize(lane_size_in_bits);
ZRegister dn_result = z7.WithLaneSize(lane_size_in_bits);
ZRegister d_result = z8.WithLaneSize(lane_size_in_bits);
__ Mov(da_result, za);
// zda = zda + (zn . zm)
dot_fn(da_result, da_result, zn, zm, is_signed, index);
__ Mov(dn_result, zn.WithSameLaneSizeAs(dn_result));
// zdn = za + (zdn . zm)
dot_fn(dn_result, za, dn_result.WithSameLaneSizeAs(zn), zm, is_signed, index);
__ Mov(dm_result, zm.WithSameLaneSizeAs(dm_result));
// zdm = za + (zn . zdm)
dot_fn(dm_result, za, zn, dm_result.WithSameLaneSizeAs(zm), is_signed, index);
__ Mov(d_result, zd);
// zd = za + (zn . zm)
dot_fn(d_result, za, zn, zm, is_signed, index);
__ Mov(dnm_result, zn.WithSameLaneSizeAs(dnm_result));
// zdnm = za + (zdnm . zdnm)
dot_fn(dnm_result,
za,
dnm_result.WithSameLaneSizeAs(zn),
dnm_result.WithSameLaneSizeAs(zm),
is_signed,
index);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(za_inputs, z1.WithLaneSize(lane_size_in_bits));
ASSERT_EQUAL_SVE(zn_inputs, z2.WithLaneSize(lane_size_in_bits / 4));
ASSERT_EQUAL_SVE(zm_inputs, z3.WithLaneSize(lane_size_in_bits / 4));
ASSERT_EQUAL_SVE(zd_expected, da_result);
ASSERT_EQUAL_SVE(zd_expected, dn_result);
ASSERT_EQUAL_SVE(zd_expected, dm_result);
ASSERT_EQUAL_SVE(zd_expected, d_result);
ASSERT_EQUAL_SVE(zdnm_expected, dnm_result);
}
}
TEST_SVE(sve_sdot) {
int64_t zd_inputs[] = {0x33, 0xee, 0xff};
int64_t za_inputs[] = {INT32_MAX, -3, 2};
int64_t zn_inputs[] = {-128, -128, -128, -128, 9, -1, 1, 30, -5, -20, 9, 8};
int64_t zm_inputs[] = {-128, -128, -128, -128, -19, 15, 6, 0, 9, -5, 4, 5};
// zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
int64_t zd_expected_s[] = {-2147418113, -183, 133}; // 0x8000ffff
int64_t zd_expected_d[] = {2147549183, -183, 133}; // 0x8000ffff
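// For the first lane shown: INT32_MAX + 4 * (-128 * -128) = 0x8000ffff, which
// is -2147418113 when interpreted as a signed S lane, but 2147549183 when
// accumulated into a wider D lane.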
// zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
int64_t zdnm_expected_s[] = {-2147418113, 980, 572};
int64_t zdnm_expected_d[] = {2147549183, 980, 572};
SdotUdotHelper(config,
kSRegSize,
zd_inputs,
za_inputs,
zn_inputs,
zm_inputs,
zd_expected_s,
zdnm_expected_s,
true);
SdotUdotHelper(config,
kDRegSize,
zd_inputs,
za_inputs,
zn_inputs,
zm_inputs,
zd_expected_d,
zdnm_expected_d,
true);
}
TEST_SVE(sve_udot) {
int64_t zd_inputs[] = {0x33, 0xee, 0xff};
int64_t za_inputs[] = {INT32_MAX, -3, 2};
int64_t zn_inputs[] = {-128, -128, -128, -128, 9, -1, 1, 30, -5, -20, 9, 8};
int64_t zm_inputs[] = {-128, -128, -128, -128, -19, 15, 6, 0, 9, -5, 4, 5};
// zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
int64_t zd_expected_s[] = {0x8000ffff, 0x00001749, 0x0000f085};
int64_t zd_expected_d[] = {0x000000047c00ffff,
0x000000000017ff49,
0x00000000fff00085};
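// For unsigned dot products the -128 inputs are interpreted as their unsigned
// encodings (0x80 = 128 in B lanes, 0xff80 = 65408 in H lanes), hence the
// much larger D-lane results.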
// zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
int64_t zdnm_expected_s[] = {0x8000ffff, 0x000101d4, 0x0001d03c};
int64_t zdnm_expected_d[] = {0x000000047c00ffff,
0x00000000fffe03d4,
0x00000001ffce023c};
SdotUdotHelper(config,
kSRegSize,
zd_inputs,
za_inputs,
zn_inputs,
zm_inputs,
zd_expected_s,
zdnm_expected_s,
false);
SdotUdotHelper(config,
kDRegSize,
zd_inputs,
za_inputs,
zn_inputs,
zm_inputs,
zd_expected_d,
zdnm_expected_d,
false);
}
TEST_SVE(sve_sdot_indexed_s) {
int64_t zd_inputs[] = {0xff, 0xff, 0xff, 0xff};
int64_t za_inputs[] = {0, 1, 2, 3};
int64_t zn_inputs[] =
{-1, -1, -1, -1, -2, -2, -2, -2, -3, -3, -3, -3, -4, -4, -4, -4};
int64_t zm_inputs[] =
{127, 127, 127, 127, -128, -128, -128, -128, -1, -1, -1, -1, 0, 0, 0, 0};
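// Index i selects the i-th 32-bit (four-byte) group of zm within each 128-bit
// segment: group 0 is {0, 0, 0, 0} and group 3 is {127, 127, 127, 127} here,
// so each row of expected values below corresponds to one loop index.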
constexpr int s = kQRegSize / kSRegSize;
// zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
int64_t zd_expected_s[][s] = {{0, 1, 2, 3}, // Generated from zm[0]
{4, 9, 14, 19},
{512, 1025, 1538, 2051},
{-508, -1015, -1522, -2029}};
// zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
int64_t zdnm_expected_s[][s] = {{16, 33, 50, 67},
{12, 25, 38, 51},
{8, 17, 26, 35},
{4, 9, 14, 19}};
for (unsigned i = 0; i < s; i++) {
SdotUdotHelper(config,
kSRegSize,
zd_inputs,
za_inputs,
zn_inputs,
zm_inputs,
zd_expected_s[i],
zdnm_expected_s[i],
true,
i);
}
}
TEST_SVE(sve_sdot_indexed_d) {
int64_t zd_inputs[] = {0xff, 0xff};
int64_t za_inputs[] = {0, 1};
int64_t zn_inputs[] = {-1, -1, -1, -1, -1, -1, -1, -1};
int64_t zm_inputs[] = {-128, -128, -128, -128, 127, 127, 127, 127};
constexpr int d = kQRegSize / kDRegSize;
// zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
int64_t zd_expected_d[][d] = {{-508, -507}, // Generated from zm[0]
{512, 513}};
// zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
int64_t zdnm_expected_d[][d] = {{4, 5}, {4, 5}};
for (unsigned i = 0; i < d; i++) {
SdotUdotHelper(config,
kDRegSize,
zd_inputs,
za_inputs,
zn_inputs,
zm_inputs,
zd_expected_d[i],
zdnm_expected_d[i],
true,
i);
}
}
TEST_SVE(sve_udot_indexed_s) {
int64_t zd_inputs[] = {0xff, 0xff, 0xff, 0xff};
int64_t za_inputs[] = {0, 1, 2, 3};
int64_t zn_inputs[] = {1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4};
int64_t zm_inputs[] =
{127, 127, 127, 127, 255, 255, 255, 255, 1, 1, 1, 1, 0, 0, 0, 0};
constexpr int s = kQRegSize / kSRegSize;
// zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
int64_t zd_expected_s[][s] = {{0, 1, 2, 3},
{4, 9, 14, 19},
{1020, 2041, 3062, 4083},
{508, 1017, 1526, 2035}};
// zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
int64_t zdnm_expected_s[][s] = {{16, 33, 50, 67},
{12, 25, 38, 51},
{8, 17, 26, 35},
{4, 9, 14, 19}};
for (unsigned i = 0; i < s; i++) {
SdotUdotHelper(config,
kSRegSize,
zd_inputs,
za_inputs,
zn_inputs,
zm_inputs,
zd_expected_s[i],
zdnm_expected_s[i],
false,
i);
}
}
TEST_SVE(sve_udot_indexed_d) {
int64_t zd_inputs[] = {0xff, 0xff};
int64_t za_inputs[] = {0, 1};
int64_t zn_inputs[] = {1, 1, 1, 1, 1, 1, 1, 1};
int64_t zm_inputs[] = {255, 255, 255, 255, 127, 127, 127, 127};
constexpr int d = kQRegSize / kDRegSize;
// zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
int64_t zd_expected_d[][d] = {{508, 509}, {1020, 1021}};
// zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
int64_t zdnm_expected_d[][d] = {{4, 5}, {4, 5}};
for (unsigned i = 0; i < d; i++) {
SdotUdotHelper(config,
kDRegSize,
zd_inputs,
za_inputs,
zn_inputs,
zm_inputs,
zd_expected_d[i],
zdnm_expected_d[i],
false,
i);
}
}
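// Add the zero-based 128-bit segment number to each lane of `src`: lane i of
// `dst` becomes src[i] + (i / lanes-per-segment), so the expected values
// differ between segments.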
static void IntSegmentPatternHelper(MacroAssembler* masm,
const ZRegister& dst,
const ZRegister& src) {
VIXL_ASSERT(AreSameLaneSize(dst, src));
UseScratchRegisterScope temps(masm);
ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(dst);
masm->Index(ztmp, 0, 1);
masm->Asr(ztmp, ztmp, kQRegSizeInBytesLog2 - dst.GetLaneSizeInBytesLog2());
masm->Add(dst, src, ztmp);
}
TEST_SVE(sve_sdot_udot_indexed_s) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
const int multiplier = 2;
__ Dup(z9.VnS(), multiplier);
__ Ptrue(p0.VnB());
__ Index(z29.VnS(), 4, 1);
// z29 = [... 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0]
__ And(z29.VnS(), z29.VnS(), 3);
// p7 = [... 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1]
__ Cmple(p7.VnS(), p0.Zeroing(), z29.VnS(), 0);
// p6 = [... 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
__ Cmple(p6.VnS(), p0.Zeroing(), z29.VnS(), 1);
// p5 = [... 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1]
__ Cmple(p5.VnS(), p0.Zeroing(), z29.VnS(), 2);
__ Index(z28.VnB(), 1, 1);
__ Dup(z27.VnS(), z28.VnS(), 0);
// z27 = [... 3, 2, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1]
IntSegmentPatternHelper(&masm, z27.VnB(), z27.VnB());
// z27 = [... 6, 4, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1, 8, 6, 4, 2]
__ Mul(z27.VnS(), p7.Merging(), z27.VnS(), z9.VnS());
// z27 = [... 12, 8, 4, 3, 2, 1, 4, 3, 2, 1, 8, 6, 4, 2, 16, 12, 8, 4]
__ Mul(z27.VnS(), p6.Merging(), z27.VnS(), z9.VnS());
// 2nd segment | 1st segment |
// v v
// z27 = [... 24, 16, 4, 3, 2, 1, 8, 6, 4, 2, 16, 12, 8, 4, 32, 24, 16, 8]
__ Mul(z27.VnS(), p5.Merging(), z27.VnS(), z9.VnS());
__ Dup(z0.VnS(), 0);
__ Dup(z1.VnS(), 0);
// Skip lanes from the 129th onwards, since their values overflow in the
// number sequence created by `index`.
__ Cmpls(p3.VnB(), p0.Zeroing(), z28.VnB(), 128);
__ Mov(z0.VnB(), p3.Merging(), z27.VnB());
__ Mov(z1.VnB(), p3.Merging(), z28.VnB());
__ Dup(z2.VnS(), 0);
__ Dup(z3.VnS(), 0);
__ Dup(z4.VnS(), 0);
__ Dup(z5.VnS(), 0);
__ Udot(z2.VnS(), z2.VnS(), z1.VnB(), z0.VnB(), 0);
__ Udot(z3.VnS(), z3.VnS(), z1.VnB(), z0.VnB(), 1);
__ Mul(z3.VnS(), z3.VnS(), 2);
__ Udot(z4.VnS(), z4.VnS(), z1.VnB(), z0.VnB(), 2);
__ Mul(z4.VnS(), z4.VnS(), 4);
__ Udot(z5.VnS(), z5.VnS(), z1.VnB(), z0.VnB(), 3);
__ Mul(z5.VnS(), z5.VnS(), 8);
__ Dup(z7.VnS(), 0);
__ Dup(z8.VnS(), 0);
__ Dup(z9.VnS(), 0);
__ Dup(z10.VnS(), 0);
// Negate the all-positive vector to test signed dot.
__ Neg(z6.VnB(), p0.Merging(), z0.VnB());
__ Sdot(z7.VnS(), z7.VnS(), z1.VnB(), z6.VnB(), 0);
__ Sdot(z8.VnS(), z8.VnS(), z1.VnB(), z6.VnB(), 1);
__ Mul(z8.VnS(), z8.VnS(), 2);
__ Sdot(z9.VnS(), z9.VnS(), z1.VnB(), z6.VnB(), 2);
__ Mul(z9.VnS(), z9.VnS(), 4);
__ Sdot(z10.VnS(), z10.VnS(), z1.VnB(), z6.VnB(), 3);
__ Mul(z10.VnS(), z10.VnS(), 8);
END();
if (CAN_RUN()) {
RUN();
// Only compare the first 128-bit segment of the destination register; use
// the other results from the generated instructions to check the remaining
// segments.
// s_lane[0] = (1 * 8) + (2 * 16) + (3 * 24) + (4 * 32) = 240
// ...
// s_lane[3] = (13 * 8) + (14 * 16) + (15 * 24) + (16 * 32) = 1200
int udot_expected[] = {1200, 880, 560, 240};
ASSERT_EQUAL_SVE(udot_expected, z2.VnS());
ASSERT_EQUAL_SVE(z2.VnS(), z3.VnS());
ASSERT_EQUAL_SVE(z2.VnS(), z4.VnS());
ASSERT_EQUAL_SVE(z2.VnS(), z5.VnS());
int sdot_expected[] = {-1200, -880, -560, -240};
ASSERT_EQUAL_SVE(sdot_expected, z7.VnS());
ASSERT_EQUAL_SVE(z7.VnS(), z8.VnS());
ASSERT_EQUAL_SVE(z7.VnS(), z9.VnS());
ASSERT_EQUAL_SVE(z7.VnS(), z10.VnS());
}
}
TEST_SVE(sve_sdot_udot_indexed_d) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
const int multiplier = 2;
__ Dup(z9.VnD(), multiplier);
__ Ptrue(p0.VnD());
__ Pfalse(p1.VnD());
// p2 = [..., 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
__ Zip1(p2.VnD(), p0.VnD(), p1.VnD());
__ Index(z1.VnH(), 1, 1);
__ Dup(z0.VnD(), z1.VnD(), 0);
// z0 = [... 5, 4, 3, 2, 5, 4, 3, 2, 4, 3, 2, 1, 4, 3, 2, 1]
IntSegmentPatternHelper(&masm, z0.VnH(), z0.VnH());
// 2nd segment | 1st segment |
// v v
// z0 = [... 5, 4, 3, 2, 10, 8, 6, 4, 4, 3, 2, 1, 8, 6, 4, 2]
__ Mul(z0.VnD(), p2.Merging(), z0.VnD(), z9.VnD());
__ Dup(z3.VnD(), 0);
__ Dup(z4.VnD(), 0);
__ Udot(z3.VnD(), z3.VnD(), z1.VnH(), z0.VnH(), 0);
__ Udot(z4.VnD(), z4.VnD(), z1.VnH(), z0.VnH(), 1);
__ Mul(z4.VnD(), z4.VnD(), multiplier);
__ Dup(z12.VnD(), 0);
__ Dup(z13.VnD(), 0);
__ Ptrue(p4.VnH());
__ Neg(z10.VnH(), p4.Merging(), z0.VnH());
__ Sdot(z12.VnD(), z12.VnD(), z1.VnH(), z10.VnH(), 0);
__ Sdot(z13.VnD(), z13.VnD(), z1.VnH(), z10.VnH(), 1);
__ Mul(z13.VnD(), z13.VnD(), multiplier);
END();
if (CAN_RUN()) {
RUN();
// Only compare the first 128-bit segment of the destination register; use
// the other results from the generated instructions to check the remaining
// segments.
// d_lane[0] = (1 * 2) + (2 * 4) + (3 * 6) + (4 * 8) = 60
// d_lane[1] = (5 * 2) + (6 * 4) + (7 * 6) + (8 * 8) = 140
uint64_t udot_expected[] = {416, 304, 140, 60};
ASSERT_EQUAL_SVE(udot_expected, z3.VnD());
ASSERT_EQUAL_SVE(z3.VnD(), z4.VnD());
int64_t sdot_expected[] = {-416, -304, -140, -60};
ASSERT_EQUAL_SVE(sdot_expected, z12.VnD());
ASSERT_EQUAL_SVE(z12.VnD(), z13.VnD());
}
}
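// Convert an array of floating-point inputs to raw bit patterns of the
// requested lane size.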
template <typename T, size_t N>
static void FPToRawbitsWithSize(const T (&inputs)[N],
uint64_t* outputs,
unsigned size_in_bits) {
for (size_t i = 0; i < N; i++) {
outputs[i] = vixl::FPToRawbitsWithSize(size_in_bits, inputs[i]);
}
}
template <typename Ti, typename Te, size_t N>
static void FPBinArithHelper(Test* config,
ArithFn macro,
int lane_size_in_bits,
const Ti (&zn_inputs)[N],
const Ti (&zm_inputs)[N],
const Te (&zd_expected)[N]) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
ZRegister zd = z29.WithLaneSize(lane_size_in_bits);
ZRegister zn = z30.WithLaneSize(lane_size_in_bits);
ZRegister zm = z31.WithLaneSize(lane_size_in_bits);
uint64_t zn_rawbits[N];
uint64_t zm_rawbits[N];
FPToRawbitsWithSize(zn_inputs, zn_rawbits, lane_size_in_bits);
FPToRawbitsWithSize(zm_inputs, zm_rawbits, lane_size_in_bits);
InsrHelper(&masm, zn, zn_rawbits);
InsrHelper(&masm, zm, zm_rawbits);
(masm.*macro)(zd, zn, zm);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(zd_expected, zd);
}
}
TEST_SVE(sve_fp_arithmetic_unpredicated_fadd) {
double zn_inputs[] = {24.0,
5.5,
0.0,
3.875,
2.125,
kFP64PositiveInfinity,
kFP64NegativeInfinity};
double zm_inputs[] = {1024.0, 2048.0, 0.1, -4.75, 12.34, 255.0, -13.0};
ArithFn fn = &MacroAssembler::Fadd;
uint16_t expected_h[] = {Float16ToRawbits(Float16(1048.0)),
Float16ToRawbits(Float16(2053.5)),
Float16ToRawbits(Float16(0.1)),
Float16ToRawbits(Float16(-0.875)),
Float16ToRawbits(Float16(14.465)),
Float16ToRawbits(kFP16PositiveInfinity),
Float16ToRawbits(kFP16NegativeInfinity)};
FPBinArithHelper(config, fn, kHRegSize, zn_inputs, zm_inputs, expected_h);
uint32_t expected_s[] = {FloatToRawbits(1048.0f),
FloatToRawbits(2053.5f),
FloatToRawbits(0.1f),
FloatToRawbits(-0.875f),
FloatToRawbits(14.465f),
FloatToRawbits(kFP32PositiveInfinity),
FloatToRawbits(kFP32NegativeInfinity)};
FPBinArithHelper(config, fn, kSRegSize, zn_inputs, zm_inputs, expected_s);
uint64_t expected_d[] = {DoubleToRawbits(1048.0),
DoubleToRawbits(2053.5),
DoubleToRawbits(0.1),
DoubleToRawbits(-0.875),
DoubleToRawbits(14.465),
DoubleToRawbits(kFP64PositiveInfinity),
DoubleToRawbits(kFP64NegativeInfinity)};
FPBinArithHelper(config, fn, kDRegSize, zn_inputs, zm_inputs, expected_d);
}
TEST_SVE(sve_fp_arithmetic_unpredicated_fsub) {
double zn_inputs[] = {24.0,
5.5,
0.0,
3.875,
2.125,
kFP64PositiveInfinity,
kFP64NegativeInfinity};
double zm_inputs[] = {1024.0, 2048.0, 0.1, -4.75, 12.34, 255.0, -13.0};
ArithFn fn = &MacroAssembler::Fsub;
uint16_t expected_h[] = {Float16ToRawbits(Float16(-1000.0)),
Float16ToRawbits(Float16(-2042.5)),
Float16ToRawbits(Float16(-0.1)),
Float16ToRawbits(Float16(8.625)),
Float16ToRawbits(Float16(-10.215)),
Float16ToRawbits(kFP16PositiveInfinity),
Float16ToRawbits(kFP16NegativeInfinity)};
FPBinArithHelper(config, fn, kHRegSize, zn_inputs, zm_inputs, expected_h);
uint32_t expected_s[] = {FloatToRawbits(-1000.0),
FloatToRawbits(-2042.5),
FloatToRawbits(-0.1),
FloatToRawbits(8.625),
FloatToRawbits(-10.215),
FloatToRawbits(kFP32PositiveInfinity),
FloatToRawbits(kFP32NegativeInfinity)};
FPBinArithHelper(config, fn, kSRegSize, zn_inputs, zm_inputs, expected_s);
uint64_t expected_d[] = {DoubleToRawbits(-1000.0),
DoubleToRawbits(-2042.5),
DoubleToRawbits(-0.1),
DoubleToRawbits(8.625),
DoubleToRawbits(-10.215),
DoubleToRawbits(kFP64PositiveInfinity),
DoubleToRawbits(kFP64NegativeInfinity)};
FPBinArithHelper(config, fn, kDRegSize, zn_inputs, zm_inputs, expected_d);
}
TEST_SVE(sve_fp_arithmetic_unpredicated_fmul) {
double zn_inputs[] = {24.0,
5.5,
0.0,
3.875,
2.125,
kFP64PositiveInfinity,
kFP64NegativeInfinity};
double zm_inputs[] = {1024.0, 2048.0, 0.1, -4.75, 12.34, 255.0, -13.0};
ArithFn fn = &MacroAssembler::Fmul;
uint16_t expected_h[] = {Float16ToRawbits(Float16(24576.0)),
Float16ToRawbits(Float16(11264.0)),
Float16ToRawbits(Float16(0.0)),
Float16ToRawbits(Float16(-18.4)),
Float16ToRawbits(Float16(26.23)),
Float16ToRawbits(kFP16PositiveInfinity),
Float16ToRawbits(kFP16PositiveInfinity)};
FPBinArithHelper(config, fn, kHRegSize, zn_inputs, zm_inputs, expected_h);
uint32_t expected_s[] = {FloatToRawbits(24576.0),
FloatToRawbits(11264.0),
FloatToRawbits(0.0),
FloatToRawbits(-18.40625),
FloatToRawbits(26.2225),
FloatToRawbits(kFP32PositiveInfinity),
FloatToRawbits(kFP32PositiveInfinity)};
FPBinArithHelper(config, fn, kSRegSize, zn_inputs, zm_inputs, expected_s);
uint64_t expected_d[] = {DoubleToRawbits(24576.0),
DoubleToRawbits(11264.0),
DoubleToRawbits(0.0),
DoubleToRawbits(-18.40625),
DoubleToRawbits(26.2225),
DoubleToRawbits(kFP64PositiveInfinity),
DoubleToRawbits(kFP64PositiveInfinity)};
FPBinArithHelper(config, fn, kDRegSize, zn_inputs, zm_inputs, expected_d);
}
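// Some predicated FP macros take an FPMacroNaNPropagationOption and others do
// not, so both pointer-to-member signatures are needed by the helper below.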
typedef void (MacroAssembler::*FPArithPredicatedFn)(
const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& zn,
const ZRegister& zm,
FPMacroNaNPropagationOption nan_option);
typedef void (MacroAssembler::*FPArithPredicatedNoNaNOptFn)(
const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& zn,
const ZRegister& zm);
template <typename Ti, typename Te, size_t N>
static void FPBinArithHelper(
Test* config,
FPArithPredicatedFn macro,
FPArithPredicatedNoNaNOptFn macro_nonan,
unsigned lane_size_in_bits,
const Ti (&zd_inputs)[N],
const int (&pg_inputs)[N],
const Ti (&zn_inputs)[N],
const Ti (&zm_inputs)[N],
const Te (&zd_expected)[N],
FPMacroNaNPropagationOption nan_option = FastNaNPropagation) {
VIXL_ASSERT((macro == NULL) ^ (macro_nonan == NULL));
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// Avoid choosing default scratch registers.
ZRegister zd = z26.WithLaneSize(lane_size_in_bits);
ZRegister zn = z27.WithLaneSize(lane_size_in_bits);
ZRegister zm = z28.WithLaneSize(lane_size_in_bits);
uint64_t zn_inputs_rawbits[N];
uint64_t zm_inputs_rawbits[N];
uint64_t zd_inputs_rawbits[N];
FPToRawbitsWithSize(zn_inputs, zn_inputs_rawbits, lane_size_in_bits);
FPToRawbitsWithSize(zm_inputs, zm_inputs_rawbits, lane_size_in_bits);
FPToRawbitsWithSize(zd_inputs, zd_inputs_rawbits, lane_size_in_bits);
InsrHelper(&masm, zn, zn_inputs_rawbits);
InsrHelper(&masm, zm, zm_inputs_rawbits);
InsrHelper(&masm, zd, zd_inputs_rawbits);
PRegisterWithLaneSize pg = p0.WithLaneSize(lane_size_in_bits);
Initialise(&masm, pg, pg_inputs);
// `instr` zdn, pg, zdn, zm
ZRegister dn_result = z0.WithLaneSize(lane_size_in_bits);
__ Mov(dn_result, zn);
if (macro_nonan == NULL) {
(masm.*macro)(dn_result, pg.Merging(), dn_result, zm, nan_option);
} else {
(masm.*macro_nonan)(dn_result, pg.Merging(), dn_result, zm);
}
// If zd and zm are aliased, the macro (`Instr`) swaps the operand order when
// the operation is commutative; otherwise it falls back to the reversed form
// of the instruction, such as fdivr.
// `instr` zdm, pg, zn, zdm
ZRegister dm_result = z1.WithLaneSize(lane_size_in_bits);
__ Mov(dm_result, zm);
if (macro_nonan == NULL) {
(masm.*macro)(dm_result, pg.Merging(), zn, dm_result, nan_option);
} else {
(masm.*macro_nonan)(dm_result, pg.Merging(), zn, dm_result);
}
// The macro (`Instr`) automatically selects between `instr` and
// movprfx + `instr`, based on whether zd and zn are aliased.
// A generated movprfx instruction is predicated, using the same governing
// predicate register. To keep the result deterministic, initialise the
// destination register first.
// `instr` zd, pg, zn, zm
ZRegister d_result = z2.WithLaneSize(lane_size_in_bits);
__ Mov(d_result, zd);
if (macro_nonan == NULL) {
(masm.*macro)(d_result, pg.Merging(), zn, zm, nan_option);
} else {
(masm.*macro_nonan)(d_result, pg.Merging(), zn, zm);
}
END();
if (CAN_RUN()) {
RUN();
for (size_t i = 0; i < ArrayLength(zd_expected); i++) {
int lane = static_cast<int>(ArrayLength(zd_expected) - i - 1);
if (!core.HasSVELane(dn_result, lane)) break;
if ((pg_inputs[i] & 1) != 0) {
ASSERT_EQUAL_SVE_LANE(zd_expected[i], dn_result, lane);
} else {
ASSERT_EQUAL_SVE_LANE(zn_inputs_rawbits[i], dn_result, lane);
}
}
for (size_t i = 0; i < ArrayLength(zd_expected); i++) {
int lane = static_cast<int>(ArrayLength(zd_expected) - i - 1);
if (!core.HasSVELane(dm_result, lane)) break;
if ((pg_inputs[i] & 1) != 0) {
ASSERT_EQUAL_SVE_LANE(zd_expected[i], dm_result, lane);
} else {
ASSERT_EQUAL_SVE_LANE(zm_inputs_rawbits[i], dm_result, lane);
}
}
ASSERT_EQUAL_SVE(zd_expected, d_result);
}
}
TEST_SVE(sve_binary_arithmetic_predicated_fdiv) {
// The same inputs are shared across the different-precision tests.
double zd_in[] = {0.1, 1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9};
double zn_in[] = {24.0,
24.0,
-2.0,
-2.0,
5.5,
5.5,
kFP64PositiveInfinity,
kFP64PositiveInfinity,
kFP64NegativeInfinity,
kFP64NegativeInfinity};
double zm_in[] = {-2.0, -2.0, 24.0, 24.0, 0.5, 0.5, 0.65, 0.65, 24.0, 24.0};
int pg_in[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
uint16_t exp_h[] = {Float16ToRawbits(Float16(0.1)),
Float16ToRawbits(Float16(-12.0)),
Float16ToRawbits(Float16(2.2)),
Float16ToRawbits(Float16(-0.0833)),
Float16ToRawbits(Float16(4.4)),
Float16ToRawbits(Float16(11.0)),
Float16ToRawbits(Float16(6.6)),
Float16ToRawbits(kFP16PositiveInfinity),
Float16ToRawbits(Float16(8.8)),
Float16ToRawbits(kFP16NegativeInfinity)};
FPBinArithHelper(config,
NULL,
&MacroAssembler::Fdiv,
kHRegSize,
zd_in,
pg_in,
zn_in,
zm_in,
exp_h);
uint32_t exp_s[] = {FloatToRawbits(0.1),
FloatToRawbits(-12.0),
FloatToRawbits(2.2),
0xbdaaaaab,
FloatToRawbits(4.4),
FloatToRawbits(11.0),
FloatToRawbits(6.6),
FloatToRawbits(kFP32PositiveInfinity),
FloatToRawbits(8.8),
FloatToRawbits(kFP32NegativeInfinity)};
FPBinArithHelper(config,
NULL,
&MacroAssembler::Fdiv,
kSRegSize,
zd_in,
pg_in,
zn_in,
zm_in,
exp_s);
uint64_t exp_d[] = {DoubleToRawbits(0.1),
DoubleToRawbits(-12.0),
DoubleToRawbits(2.2),
0xbfb5555555555555,
DoubleToRawbits(4.4),
DoubleToRawbits(11.0),
DoubleToRawbits(6.6),
DoubleToRawbits(kFP64PositiveInfinity),
DoubleToRawbits(8.8),
DoubleToRawbits(kFP64NegativeInfinity)};
FPBinArithHelper(config,
NULL,
&MacroAssembler::Fdiv,
kDRegSize,
zd_in,
pg_in,
zn_in,
zm_in,
exp_d);
}
TEST_SVE(sve_select) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t in0[] = {0x01f203f405f607f8, 0xfefcf8f0e1c3870f, 0x123456789abcdef0};
uint64_t in1[] = {0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa};
// For simplicity, we re-use the same pg for various lane sizes.
// For D lanes: 1, 1, 0
// For S lanes: 1, 1, 1, 0, 0
// For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
Initialise(&masm, p0.VnB(), pg_in);
PRegisterM pg = p0.Merging();
InsrHelper(&masm, z30.VnD(), in0);
InsrHelper(&masm, z31.VnD(), in1);
__ Sel(z0.VnB(), pg, z30.VnB(), z31.VnB());
__ Sel(z1.VnH(), pg, z30.VnH(), z31.VnH());
__ Sel(z2.VnS(), pg, z30.VnS(), z31.VnS());
__ Sel(z3.VnD(), pg, z30.VnD(), z31.VnD());
END();
if (CAN_RUN()) {
RUN();
uint64_t expected_z0[] = {0xaaaaaaaa05aa07f8,
0xfeaaaaf0aac3870f,
0xaaaa56aa9abcdeaa};
ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
uint64_t expected_z1[] = {0xaaaaaaaaaaaa07f8,
0xaaaaf8f0e1c3870f,
0xaaaaaaaa9abcaaaa};
ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
uint64_t expected_z2[] = {0xaaaaaaaa05f607f8,
0xfefcf8f0e1c3870f,
0xaaaaaaaaaaaaaaaa};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
uint64_t expected_z3[] = {0x01f203f405f607f8,
0xfefcf8f0e1c3870f,
0xaaaaaaaaaaaaaaaa};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
}
}
TEST_SVE(sve_binary_arithmetic_predicated_fmax_fmin_h) {
double zd_inputs[] = {1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8};
double zn_inputs[] = {-2.1,
8.5,
225.5,
0.0,
8.8,
-4.75,
kFP64PositiveInfinity,
kFP64NegativeInfinity};
double zm_inputs[] = {-2.0,
-13.0,
24.0,
0.01,
0.5,
300.75,
kFP64NegativeInfinity,
kFP64PositiveInfinity};
int pg_inputs[] = {1, 1, 0, 1, 0, 1, 1, 1};
uint16_t zd_expected_max[] = {Float16ToRawbits(Float16(-2.0)),
Float16ToRawbits(Float16(8.5)),
Float16ToRawbits(Float16(3.3)),
Float16ToRawbits(Float16(0.01)),
Float16ToRawbits(Float16(5.5)),
Float16ToRawbits(Float16(300.75)),
Float16ToRawbits(kFP16PositiveInfinity),
Float16ToRawbits(kFP16PositiveInfinity)};
FPBinArithHelper(config,
&MacroAssembler::Fmax,
NULL,
kHRegSize,
zd_inputs,
pg_inputs,
zn_inputs,
zm_inputs,
zd_expected_max);
uint16_t zd_expected_min[] = {Float16ToRawbits(Float16(-2.1)),
Float16ToRawbits(Float16(-13.0)),
Float16ToRawbits(Float16(3.3)),
Float16ToRawbits(Float16(0.0)),
Float16ToRawbits(Float16(5.5)),
Float16ToRawbits(Float16(-4.75)),
Float16ToRawbits(kFP16NegativeInfinity),
Float16ToRawbits(kFP16NegativeInfinity)};
FPBinArithHelper(config,
&MacroAssembler::Fmin,
NULL,
kHRegSize,
zd_inputs,
pg_inputs,
zn_inputs,
zm_inputs,
zd_expected_min);
}
TEST_SVE(sve_binary_arithmetic_predicated_fmax_fmin_s) {
double zd_inputs[] = {1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8};
double zn_inputs[] = {-2.1,
8.5,
225.5,
0.0,
8.8,
-4.75,
kFP64PositiveInfinity,
kFP64NegativeInfinity};
double zm_inputs[] = {-2.0,
-13.0,
24.0,
0.01,
0.5,
300.75,
kFP64NegativeInfinity,
kFP64PositiveInfinity};
int pg_inputs[] = {1, 1, 0, 1, 0, 1, 1, 1};
uint32_t zd_expected_max[] = {FloatToRawbits(-2.0),
FloatToRawbits(8.5),
FloatToRawbits(3.3),
FloatToRawbits(0.01),
FloatToRawbits(5.5),
FloatToRawbits(300.75),
FloatToRawbits(kFP32PositiveInfinity),
FloatToRawbits(kFP32PositiveInfinity)};
FPBinArithHelper(config,
&MacroAssembler::Fmax,
NULL,
kSRegSize,
zd_inputs,
pg_inputs,
zn_inputs,
zm_inputs,
zd_expected_max);
uint32_t zd_expected_min[] = {FloatToRawbits(-2.1),
FloatToRawbits(-13.0),
FloatToRawbits(3.3),
FloatToRawbits(0.0),
FloatToRawbits(5.5),
FloatToRawbits(-4.75),
FloatToRawbits(kFP32NegativeInfinity),
FloatToRawbits(kFP32NegativeInfinity)};
FPBinArithHelper(config,
&MacroAssembler::Fmin,
NULL,
kSRegSize,
zd_inputs,
pg_inputs,
zn_inputs,
zm_inputs,
zd_expected_min);
}
TEST_SVE(sve_binary_arithmetic_predicated_fmax_fmin_d) {
double zd_inputs[] = {1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8};
double zn_inputs[] = {-2.1,
8.5,
225.5,
0.0,
8.8,
-4.75,
kFP64PositiveInfinity,
kFP64NegativeInfinity};
double zm_inputs[] = {-2.0,
-13.0,
24.0,
0.01,
0.5,
300.75,
kFP64NegativeInfinity,
kFP64PositiveInfinity};
int pg_inputs[] = {1, 1, 0, 1, 0, 1, 1, 1};
uint64_t zd_expected_max[] = {DoubleToRawbits(-2.0),
DoubleToRawbits(8.5),
DoubleToRawbits(3.3),
DoubleToRawbits(0.01),
DoubleToRawbits(5.5),
DoubleToRawbits(300.75),
DoubleToRawbits(kFP64PositiveInfinity),
DoubleToRawbits(kFP64PositiveInfinity)};
FPBinArithHelper(config,
&MacroAssembler::Fmax,
NULL,
kDRegSize,
zd_inputs,
pg_inputs,
zn_inputs,
zm_inputs,
zd_expected_max);
uint64_t zd_expected_min[] = {DoubleToRawbits(-2.1),
DoubleToRawbits(-13.0),
DoubleToRawbits(3.3),
DoubleToRawbits(0.0),
DoubleToRawbits(5.5),
DoubleToRawbits(-4.75),
DoubleToRawbits(kFP64NegativeInfinity),
DoubleToRawbits(kFP64NegativeInfinity)};
FPBinArithHelper(config,
&MacroAssembler::Fmin,
NULL,
kDRegSize,
zd_inputs,
pg_inputs,
zn_inputs,
zm_inputs,
zd_expected_min);
}
template <typename T, size_t N>
static void BitwiseShiftImmHelper(Test* config,
int lane_size_in_bits,
const T (&zn_inputs)[N],
int shift) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
ZRegister zd_asr = z25.WithLaneSize(lane_size_in_bits);
ZRegister zd_lsr = z26.WithLaneSize(lane_size_in_bits);
ZRegister zd_lsl = z27.WithLaneSize(lane_size_in_bits);
ZRegister zn = z28.WithLaneSize(lane_size_in_bits);
InsrHelper(&masm, zn, zn_inputs);
__ Asr(zd_asr, zn, shift);
__ Lsr(zd_lsr, zn, shift);
__ Lsl(zd_lsl, zn, shift - 1); // Lsl supports shifts in [0, lane_size - 1].
END();
if (CAN_RUN()) {
RUN();
const uint64_t mask = GetUintMask(lane_size_in_bits);
for (int i = 0; i < static_cast<int>(N); i++) {
int lane = N - i - 1;
if (!core.HasSVELane(zd_asr, lane)) break;
bool is_negative = (zn_inputs[i] & GetSignMask(lane_size_in_bits)) != 0;
uint64_t result;
if (shift >= lane_size_in_bits) {
result = is_negative ? mask : 0;
} else {
result = zn_inputs[i] >> shift;
if (is_negative) {
result |= mask << (lane_size_in_bits - shift);
result &= mask;
}
}
ASSERT_EQUAL_SVE_LANE(result, zd_asr, lane);
}
for (int i = 0; i < static_cast<int>(N); i++) {
int lane = N - i - 1;
if (!core.HasSVELane(zd_lsr, lane)) break;
uint64_t result =
(shift >= lane_size_in_bits) ? 0 : zn_inputs[i] >> shift;
ASSERT_EQUAL_SVE_LANE(result, zd_lsr, lane);
}
for (int i = 0; i < static_cast<int>(N); i++) {
int lane = N - i - 1;
if (!core.HasSVELane(zd_lsl, lane)) break;
uint64_t result =
(shift > lane_size_in_bits) ? 0 : zn_inputs[i] << (shift - 1);
ASSERT_EQUAL_SVE_LANE(result & mask, zd_lsl, lane);
}
}
}
TEST_SVE(sve_bitwise_shift_imm_unpredicated) {
uint64_t inputs_b[] = {0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80};
int shift_b[] = {1, 3, 5, 8};
for (size_t i = 0; i < ArrayLength(shift_b); i++) {
BitwiseShiftImmHelper(config, kBRegSize, inputs_b, shift_b[i]);
}
uint64_t inputs_h[] = {0xfedc, 0xfa55, 0x0011, 0x2233};
int shift_h[] = {1, 8, 11, 16};
for (size_t i = 0; i < ArrayLength(shift_h); i++) {
BitwiseShiftImmHelper(config, kHRegSize, inputs_h, shift_h[i]);
}
uint64_t inputs_s[] = {0xfedcba98, 0xfffa55aa, 0x00112233};
int shift_s[] = {1, 9, 17, 32};
for (size_t i = 0; i < ArrayLength(shift_s); i++) {
BitwiseShiftImmHelper(config, kSRegSize, inputs_s, shift_s[i]);
}
uint64_t inputs_d[] = {0xfedcba98fedcba98,
0xfffa5555aaaaaaaa,
0x0011223344aafe80};
int shift_d[] = {1, 23, 45, 64};
for (size_t i = 0; i < ArrayLength(shift_d); i++) {
BitwiseShiftImmHelper(config, kDRegSize, inputs_d, shift_d[i]);
}
}
template <typename T, typename R, size_t N>
static void BitwiseShiftWideElementsHelper(Test* config,
Shift shift_type,
int lane_size_in_bits,
const T (&zn_inputs)[N],
const R& zm_inputs,
const T (&zd_expected)[N]) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
ArithFn macro;
// A logical shift left or right by the lane width yields 0, so initialise
// the array to 0 for convenience.
uint64_t zd_expected_max_shift_amount[N] = {0};
switch (shift_type) {
case ASR: {
macro = &MacroAssembler::Asr;
uint64_t mask = GetUintMask(lane_size_in_bits);
for (size_t i = 0; i < ArrayLength(zn_inputs); i++) {
bool is_negative = (zn_inputs[i] & GetSignMask(lane_size_in_bits)) != 0;
zd_expected_max_shift_amount[i] = is_negative ? mask : 0;
}
break;
}
case LSR:
macro = &MacroAssembler::Lsr;
break;
case LSL:
macro = &MacroAssembler::Lsl;
break;
default:
VIXL_UNIMPLEMENTED();
macro = NULL;
break;
}
ZRegister zd = z26.WithLaneSize(lane_size_in_bits);
ZRegister zn = z27.WithLaneSize(lane_size_in_bits);
ZRegister zm = z28.WithLaneSize(kDRegSize);
InsrHelper(&masm, zn, zn_inputs);
InsrHelper(&masm, zm, zm_inputs);
(masm.*macro)(zd, zn, zm);
ZRegister zm_max_shift_amount = z25.WithLaneSize(kDRegSize);
ZRegister zd_max_shift_amount = z24.WithLaneSize(lane_size_in_bits);
__ Dup(zm_max_shift_amount, lane_size_in_bits);
(masm.*macro)(zd_max_shift_amount, zn, zm_max_shift_amount);
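// Shift amounts of the lane size or more are out of range; they are expected
// to behave like a shift by exactly the lane size.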
ZRegister zm_out_of_range = z23.WithLaneSize(kDRegSize);
ZRegister zd_out_of_range = z22.WithLaneSize(lane_size_in_bits);
__ Dup(zm_out_of_range, GetUintMask(lane_size_in_bits));
(masm.*macro)(zd_out_of_range, zn, zm_out_of_range);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(zd_expected, zd);
ASSERT_EQUAL_SVE(zd_expected_max_shift_amount, zd_max_shift_amount);
ASSERT_EQUAL_SVE(zd_max_shift_amount, zd_out_of_range);
}
}
TEST_SVE(sve_bitwise_shift_wide_elements_unpredicated_asr) {
// clang-format off
uint64_t inputs_b[] = {0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80,
0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80};
int shift_b[] = {1, 3};
uint64_t expected_b[] = {0xff, 0xee, 0xdd, 0xcc, 0xff, 0x2a, 0xd5, 0xc0,
0xff, 0xfb, 0xf7, 0xf3, 0xff, 0x0a, 0xf5, 0xf0};
BitwiseShiftWideElementsHelper(config,
ASR,
kBRegSize,
inputs_b,
shift_b,
expected_b);
uint64_t inputs_h[] = {0xfedc, 0xfa55, 0x0011, 0x2233,
0xfedc, 0xfa55, 0x0011, 0x2233,
0xfedc, 0xfa55, 0x0011, 0x2233};
int shift_h[] = {1, 8, 11};
uint64_t expected_h[] = {0xff6e, 0xfd2a, 0x0008, 0x1119,
0xfffe, 0xfffa, 0x0000, 0x0022,
0xffff, 0xffff, 0x0000, 0x0004};
BitwiseShiftWideElementsHelper(config,
ASR,
kHRegSize,
inputs_h,
shift_h,
expected_h);
uint64_t inputs_s[] =
{0xfedcba98, 0xfffa55aa, 0x00112233, 0x01234567, 0xaaaaaaaa, 0x88888888};
int shift_s[] = {1, 9, 23};
uint64_t expected_s[] =
{0xff6e5d4c, 0xfffd2ad5, 0x00000891, 0x000091a2, 0xffffff55, 0xffffff11};
BitwiseShiftWideElementsHelper(config,
ASR,
kSRegSize,
inputs_s,
shift_s,
expected_s);
// clang-format on
}
TEST_SVE(sve_bitwise_shift_wide_elements_unpredicated_lsr) {
// clang-format off
uint64_t inputs_b[] = {0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80,
0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80};
int shift_b[] = {1, 3};
uint64_t expected_b[] = {0x7f, 0x6e, 0x5d, 0x4c, 0x7f, 0x2a, 0x55, 0x40,
0x1f, 0x1b, 0x17, 0x13, 0x1f, 0x0a, 0x15, 0x10};
BitwiseShiftWideElementsHelper(config,
LSR,
kBRegSize,
inputs_b,
shift_b,
expected_b);
uint64_t inputs_h[] = {0xfedc, 0xfa55, 0x0011, 0x2233,
0xfedc, 0xfa55, 0x0011, 0x2233,
0xfedc, 0xfa55, 0x0011, 0x2233};
int shift_h[] = {1, 8, 11};
uint64_t expected_h[] = {0x7f6e, 0x7d2a, 0x0008, 0x1119,
0x00fe, 0x00fa, 0x0000, 0x0022,
0x001f, 0x001f, 0x0000, 0x0004};
BitwiseShiftWideElementsHelper(config,
LSR,
kHRegSize,
inputs_h,
shift_h,
expected_h);
uint64_t inputs_s[] =
{0xfedcba98, 0xfffa55aa, 0x00112233, 0x01234567, 0xaaaaaaaa, 0x88888888};
int shift_s[] = {1, 9, 23};
uint64_t expected_s[] =
{0x7f6e5d4c, 0x7ffd2ad5, 0x00000891, 0x000091a2, 0x00000155, 0x00000111};
BitwiseShiftWideElementsHelper(config,
LSR,
kSRegSize,
inputs_s,
shift_s,
expected_s);
// clang-format on
}
TEST_SVE(sve_bitwise_shift_wide_elements_unpredicated_lsl) {
// clang-format off
uint64_t inputs_b[] = {0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80,
0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80};
int shift_b[] = {1, 5};
uint64_t expected_b[] = {0xfc, 0xb8, 0x74, 0x30, 0xfe, 0xaa, 0x54, 0x00,
0xc0, 0x80, 0x40, 0x00, 0xe0, 0xa0, 0x40, 0x00};
BitwiseShiftWideElementsHelper(config,
LSL,
kBRegSize,
inputs_b,
shift_b,
expected_b);
uint64_t inputs_h[] = {0xfedc, 0xfa55, 0x0011, 0x2233,
0xfedc, 0xfa55, 0x0011, 0x2233,
0xfedc, 0xfa55, 0x0011, 0x2233};
int shift_h[] = {1, 2, 14};
uint64_t expected_h[] = {0xfdb8, 0xf4aa, 0x0022, 0x4466,
0xfb70, 0xe954, 0x0044, 0x88cc,
0x0000, 0x4000, 0x4000, 0xc000};
BitwiseShiftWideElementsHelper(config,
LSL,
kHRegSize,
inputs_h,
shift_h,
expected_h);
uint64_t inputs_s[] =
{0xfedcba98, 0xfffa55aa, 0x00112233, 0x01234567, 0xaaaaaaaa, 0x88888888};
int shift_s[] = {1, 19, 26};
uint64_t expected_s[] =
{0xfdb97530, 0xfff4ab54, 0x11980000, 0x2b380000, 0xa8000000, 0x20000000};
BitwiseShiftWideElementsHelper(config,
LSL,
kSRegSize,
inputs_s,
shift_s,
expected_s);
// Test shift amounts that do not fit in a 32-bit "unsigned" type.
uint64_t inputs_b2[] = {1, 2, 4, 8, 3, 5, 7, 9,
1, 2, 4, 8, 3, 5, 7, 9};
uint64_t shift_b2[] = {1, 0x1000000001};
uint64_t expected_b2[] = {2, 4, 8, 16, 6, 10, 14, 18,
0, 0, 0, 0, 0, 0, 0, 0};
BitwiseShiftWideElementsHelper(config, LSL, kBRegSize, inputs_b2, shift_b2,
expected_b2);
// clang-format on
}
TEST_SVE(sve_shift_by_vector) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
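// Shift each lane of the source by the amount held in the corresponding lane
// of the shift vector. Amounts of the lane size or more produce zero for LSL
// and LSR, and the sign for ASR; merging predication leaves inactive lanes
// unchanged.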
__ Ptrue(p0.VnB());
__ Pfalse(p1.VnB());
__ Zip1(p2.VnB(), p0.VnB(), p1.VnB());
__ Zip1(p3.VnH(), p0.VnH(), p1.VnH());
__ Zip1(p4.VnS(), p0.VnS(), p1.VnS());
__ Zip1(p5.VnD(), p0.VnD(), p1.VnD());
__ Dup(z31.VnD(), 0x8000000080008080);
__ Dup(z0.VnB(), -1);
__ Index(z1.VnB(), 0, 1);
__ Dup(z2.VnB(), 0x55);
__ Lsr(z2.VnB(), p2.Merging(), z0.VnB(), z1.VnB());
__ Lsl(z3.VnB(), p0.Merging(), z0.VnB(), z1.VnB());
__ Asr(z4.VnB(), p0.Merging(), z31.VnB(), z1.VnB());
__ Index(z1.VnH(), 0, 1);
__ Dup(z6.VnB(), 0x55);
__ Lsr(z5.VnH(), p0.Merging(), z0.VnH(), z1.VnH());
__ Lsl(z6.VnH(), p3.Merging(), z0.VnH(), z1.VnH());
__ Asr(z7.VnH(), p0.Merging(), z31.VnH(), z1.VnH());
__ Index(z1.VnS(), 0, 1);
__ Dup(z10.VnB(), 0x55);
__ Lsr(z8.VnS(), p0.Merging(), z0.VnS(), z1.VnS());
__ Lsl(z9.VnS(), p0.Merging(), z0.VnS(), z1.VnS());
__ Asr(z10.VnS(), p4.Merging(), z31.VnS(), z1.VnS());
__ Index(z1.VnD(), 0, 1);
__ Lsr(z0.VnD(), p5.Merging(), z0.VnD(), z1.VnD());
__ Lsl(z12.VnD(), p0.Merging(), z0.VnD(), z1.VnD());
__ Asr(z13.VnD(), p0.Merging(), z31.VnD(), z1.VnD());
__ Dup(z11.VnD(), 0x100000001);
__ Lsl(z14.VnD(), p0.Merging(), z1.VnD(), z11.VnD());
__ Index(z0.VnH(), 7, -1);
__ Lsr(z0.VnH(), p0.Merging(), z31.VnH(), z0.VnH());
END();
if (CAN_RUN()) {
RUN();
uint64_t expected_z0[] = {0x8000000020001010, 0x0800000002000101};
ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
uint64_t expected_z2[] = {0x5500550055005500, 0x5503550f553f55ff};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
uint64_t expected_z3[] = {0x0000000000000000, 0x80c0e0f0f8fcfeff};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
uint64_t expected_z4[] = {0xff000000ff00ffff, 0xff000000f000c080};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
uint64_t expected_z5[] = {0x01ff03ff07ff0fff, 0x1fff3fff7fffffff};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
uint64_t expected_z6[] = {0x5555ffc05555fff0, 0x5555fffc5555ffff};
ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
uint64_t expected_z7[] = {0xff000000fc00f808, 0xf0000000c0008080};
ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
uint64_t expected_z8[] = {0x1fffffff3fffffff, 0x7fffffffffffffff};
ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
uint64_t expected_z9[] = {0xfffffff8fffffffc, 0xfffffffeffffffff};
ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
uint64_t expected_z10[] = {0x55555555e0002020, 0x5555555580008080};
ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
uint64_t expected_z12[] = {0xfffffffffffffffe, 0xffffffffffffffff};
ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
uint64_t expected_z13[] = {0xc000000040004040, 0x8000000080008080};
ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
uint64_t expected_z14[] = {0, 0};
ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
}
}
TEST_SVE(sve_shift_by_wide_vector) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
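// As in sve_shift_by_vector, but each D-sized lane of z1 supplies the shift
// amount for the whole 64-bit group of lanes it overlaps.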
__ Ptrue(p0.VnB());
__ Pfalse(p1.VnB());
__ Zip1(p2.VnB(), p0.VnB(), p1.VnB());
__ Zip1(p3.VnH(), p0.VnH(), p1.VnH());
__ Zip1(p4.VnS(), p0.VnS(), p1.VnS());
__ Dup(z31.VnD(), 0x8000000080008080);
__ Dup(z0.VnB(), -1);
__ Index(z1.VnD(), 1, 5);
__ Dup(z2.VnB(), 0x55);
__ Lsr(z2.VnB(), p2.Merging(), z2.VnB(), z1.VnD());
__ Lsl(z3.VnB(), p0.Merging(), z0.VnB(), z1.VnD());
__ Asr(z4.VnB(), p0.Merging(), z31.VnB(), z1.VnD());
__ Dup(z6.VnB(), 0x55);
__ Lsr(z5.VnH(), p0.Merging(), z0.VnH(), z1.VnD());
__ Lsl(z6.VnH(), p3.Merging(), z6.VnH(), z1.VnD());
__ Asr(z7.VnH(), p0.Merging(), z31.VnH(), z1.VnD());
__ Dup(z10.VnB(), 0x55);
__ Lsr(z8.VnS(), p0.Merging(), z0.VnS(), z1.VnD());
__ Lsl(z9.VnS(), p0.Merging(), z0.VnS(), z1.VnD());
__ Asr(z10.VnS(), p4.Merging(), z31.VnS(), z1.VnD());
END();
if (CAN_RUN()) {
RUN();
uint64_t expected_z2[] = {0x5501550155015501, 0x552a552a552a552a};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
uint64_t expected_z3[] = {0xc0c0c0c0c0c0c0c0, 0xfefefefefefefefe};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
uint64_t expected_z4[] = {0xfe000000fe00fefe, 0xc0000000c000c0c0};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
uint64_t expected_z5[] = {0x03ff03ff03ff03ff, 0x7fff7fff7fff7fff};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
uint64_t expected_z6[] = {0x5555554055555540, 0x5555aaaa5555aaaa};
ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
uint64_t expected_z7[] = {0xfe000000fe00fe02, 0xc0000000c000c040};
ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
uint64_t expected_z8[] = {0x03ffffff03ffffff, 0x7fffffff7fffffff};
ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
uint64_t expected_z9[] = {0xffffffc0ffffffc0, 0xfffffffefffffffe};
ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
uint64_t expected_z10[] = {0x55555555fe000202, 0x55555555c0004040};
ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
}
}
TEST_SVE(sve_pred_shift_imm) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
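// Chain predicated immediate shifts, checking that merging predication
// preserves the inactive lanes of each copied source.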
__ Ptrue(p0.VnB());
__ Pfalse(p1.VnB());
__ Zip1(p2.VnB(), p0.VnB(), p1.VnB());
__ Zip1(p3.VnH(), p0.VnH(), p1.VnH());
__ Zip1(p4.VnS(), p0.VnS(), p1.VnS());
__ Zip1(p5.VnD(), p0.VnD(), p1.VnD());
__ Dup(z31.VnD(), 0x8000000080008080);
__ Lsr(z0.VnB(), p0.Merging(), z31.VnB(), 1);
__ Mov(z1, z0);
__ Lsl(z1.VnB(), p2.Merging(), z1.VnB(), 1);
__ Asr(z2.VnB(), p0.Merging(), z1.VnB(), 2);
__ Lsr(z3.VnH(), p0.Merging(), z31.VnH(), 2);
__ Mov(z4, z3);
__ Lsl(z4.VnH(), p3.Merging(), z4.VnH(), 2);
__ Asr(z5.VnH(), p0.Merging(), z4.VnH(), 3);
__ Lsr(z6.VnS(), p0.Merging(), z31.VnS(), 3);
__ Mov(z7, z6);
__ Lsl(z7.VnS(), p4.Merging(), z7.VnS(), 3);
__ Asr(z8.VnS(), p0.Merging(), z7.VnS(), 4);
__ Lsr(z9.VnD(), p0.Merging(), z31.VnD(), 4);
__ Mov(z10, z9);
__ Lsl(z10.VnD(), p5.Merging(), z10.VnD(), 4);
__ Asr(z11.VnD(), p0.Merging(), z10.VnD(), 5);
END();
if (CAN_RUN()) {
RUN();
uint64_t expected_z0[] = {0x4000000040004040, 0x4000000040004040};
ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
uint64_t expected_z1[] = {0x4000000040004080, 0x4000000040004080};
ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
uint64_t expected_z2[] = {0x10000000100010e0, 0x10000000100010e0};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
uint64_t expected_z3[] = {0x2000000020002020, 0x2000000020002020};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
uint64_t expected_z4[] = {0x2000000020008080, 0x2000000020008080};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
uint64_t expected_z5[] = {0x040000000400f010, 0x040000000400f010};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
uint64_t expected_z6[] = {0x1000000010001010, 0x1000000010001010};
ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
uint64_t expected_z7[] = {0x1000000080008080, 0x1000000080008080};
ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
uint64_t expected_z8[] = {0x01000000f8000808, 0x01000000f8000808};
ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
uint64_t expected_z9[] = {0x0800000008000808, 0x0800000008000808};
ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
uint64_t expected_z10[] = {0x0800000008000808, 0x8000000080008080};
ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
uint64_t expected_z11[] = {0x0040000000400040, 0xfc00000004000404};
ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
}
}
TEST_SVE(sve_asrd) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
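// Asrd is an arithmetic shift right that rounds towards zero, implementing a
// signed divide by a power of two: for example, -125 asrd 1 gives -62, where
// a plain Asr would give -63.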
__ Ptrue(p0.VnB());
__ Pfalse(p1.VnB());
__ Zip1(p2.VnB(), p0.VnB(), p1.VnB());
__ Zip1(p3.VnH(), p0.VnH(), p1.VnH());
__ Zip1(p4.VnS(), p0.VnS(), p1.VnS());
__ Zip1(p5.VnD(), p0.VnD(), p1.VnD());
__ Index(z31.VnB(), 0x7f - 3, 1);
__ Asrd(z0.VnB(), p0.Merging(), z31.VnB(), 1);
__ Mov(z1, z31);
__ Asrd(z1.VnB(), p2.Merging(), z1.VnB(), 2);
__ Asrd(z2.VnB(), p0.Merging(), z31.VnB(), 7);
__ Asrd(z3.VnB(), p0.Merging(), z31.VnB(), 8);
__ Index(z31.VnH(), 0x7fff - 3, 1);
__ Asrd(z4.VnH(), p0.Merging(), z31.VnH(), 1);
__ Mov(z5, z31);
__ Asrd(z5.VnH(), p3.Merging(), z5.VnH(), 2);
__ Asrd(z6.VnH(), p0.Merging(), z31.VnH(), 15);
__ Asrd(z7.VnH(), p0.Merging(), z31.VnH(), 16);
__ Index(z31.VnS(), 0x7fffffff - 1, 1);
__ Asrd(z8.VnS(), p0.Merging(), z31.VnS(), 1);
__ Mov(z9, z31);
__ Asrd(z9.VnS(), p4.Merging(), z9.VnS(), 2);
__ Asrd(z10.VnS(), p0.Merging(), z31.VnS(), 31);
__ Asrd(z11.VnS(), p0.Merging(), z31.VnS(), 32);
__ Index(z31.VnD(), 0x7fffffffffffffff, 1);
__ Asrd(z12.VnD(), p0.Merging(), z31.VnD(), 1);
__ Mov(z13, z31);
__ Asrd(z13.VnD(), p5.Merging(), z13.VnD(), 2);
__ Asrd(z14.VnD(), p0.Merging(), z31.VnD(), 63);
__ Asrd(z31.VnD(), p0.Merging(), z31.VnD(), 64);
END();
if (CAN_RUN()) {
RUN();
uint64_t expected_z0[] = {0xc6c5c5c4c4c3c3c2, 0xc2c1c1c03f3f3e3e};
ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
uint64_t expected_z1[] = {0x8be389e287e285e1, 0x83e181e07f1f7d1f};
ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
uint64_t expected_z2[] = {0x0000000000000000, 0x000000ff00000000};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
uint64_t expected_z3[] = {0x0000000000000000, 0x0000000000000000};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
uint64_t expected_z4[] = {0xc002c001c001c000, 0x3fff3fff3ffe3ffe};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
uint64_t expected_z5[] = {0x8003e0018001e000, 0x7fff1fff7ffd1fff};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
uint64_t expected_z6[] = {0x000000000000ffff, 0x0000000000000000};
ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
uint64_t expected_z7[] = {0x0000000000000000, 0x0000000000000000};
ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
uint64_t expected_z8[] = {0xc0000001c0000000, 0x3fffffff3fffffff};
ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
uint64_t expected_z9[] = {0x80000001e0000000, 0x7fffffff1fffffff};
ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
uint64_t expected_z10[] = {0x00000000ffffffff, 0x0000000000000000};
ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
uint64_t expected_z11[] = {0x0000000000000000, 0x0000000000000000};
ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
uint64_t expected_z12[] = {0xc000000000000000, 0x3fffffffffffffff};
ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
uint64_t expected_z13[] = {0x8000000000000000, 0x1fffffffffffffff};
ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
uint64_t expected_z14[] = {0xffffffffffffffff, 0x0000000000000000};
ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
uint64_t expected_z31[] = {0x0000000000000000, 0x0000000000000000};
ASSERT_EQUAL_SVE(expected_z31, z31.VnD());
}
}
TEST_SVE(sve_setffr) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
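// Setffr sets every FFR bit, so reading the FFR back must give an all-true
// predicate.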
__ Ptrue(p15.VnB());
__ Setffr();
__ Rdffr(p14.VnB());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(p14.VnB(), p15.VnB());
}
}
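// Write a predicate with the lowest `active_lanes` lanes set into the FFR,
// then check that reading the FFR back reproduces it.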
static void WrffrHelper(Test* config, unsigned active_lanes) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int inputs[kPRegMaxSize] = {0};
VIXL_ASSERT(active_lanes <= kPRegMaxSize);
for (unsigned i = 0; i < active_lanes; i++) {
// The rightmost (highest-indexed) array element maps to the lowest-numbered
// lane.
inputs[kPRegMaxSize - i - 1] = 1;
}
Initialise(&masm, p1.VnB(), inputs);
__ Wrffr(p1.VnB());
__ Rdffr(p2.VnB());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(p1.VnB(), p2.VnB());
}
}
TEST_SVE(sve_wrffr) {
int active_lanes_inputs[] = {0, 1, 7, 10, 32, 48, kPRegMaxSize};
for (size_t i = 0; i < ArrayLength(active_lanes_inputs); i++) {
WrffrHelper(config, active_lanes_inputs[i]);
}
}
template <size_t N>
static void RdffrHelper(Test* config,
size_t active_lanes,
const int (&pg_inputs)[N]) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
VIXL_ASSERT(active_lanes <= kPRegMaxSize);
// The rightmost (highest-indexed) array element maps to the lowest-numbered
// lane.
int pd[kPRegMaxSize] = {0};
for (unsigned i = 0; i < active_lanes; i++) {
pd[kPRegMaxSize - i - 1] = 1;
}
int pg[kPRegMaxSize] = {0};
for (unsigned i = 0; i < N; i++) {
pg[kPRegMaxSize - i - 1] = pg_inputs[i];
}
int pd_expected[kPRegMaxSize] = {0};
for (unsigned i = 0; i < std::min(active_lanes, N); i++) {
int lane = kPRegMaxSize - i - 1;
pd_expected[lane] = pd[lane] & pg[lane];
}
Initialise(&masm, p0.VnB(), pg);
Initialise(&masm, p1.VnB(), pd);
// The unpredicated form of rdffr has been tested in `WrffrHelper`.
__ Wrffr(p1.VnB());
__ Rdffr(p14.VnB(), p0.Zeroing());
__ Rdffrs(p13.VnB(), p0.Zeroing());
__ Mrs(x8, NZCV);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(pd_expected, p14.VnB());
ASSERT_EQUAL_SVE(pd_expected, p13.VnB());
StatusFlags nzcv_expected =
GetPredTestFlags(pd_expected, pg, core.GetSVELaneCount(kBRegSize));
ASSERT_EQUAL_64(nzcv_expected, x8);
}
}
TEST_SVE(sve_rdffr_rdffrs) {
// clang-format off
int active_lanes_inputs[] = {0, 1, 15, 26, 39, 47, kPRegMaxSize};
int pg_inputs_0[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int pg_inputs_1[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
int pg_inputs_2[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
int pg_inputs_3[] = {0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1};
int pg_inputs_4[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
// clang-format on
for (size_t i = 0; i < ArrayLength(active_lanes_inputs); i++) {
RdffrHelper(config, active_lanes_inputs[i], pg_inputs_0);
RdffrHelper(config, active_lanes_inputs[i], pg_inputs_1);
RdffrHelper(config, active_lanes_inputs[i], pg_inputs_2);
RdffrHelper(config, active_lanes_inputs[i], pg_inputs_3);
RdffrHelper(config, active_lanes_inputs[i], pg_inputs_4);
}
}
typedef void (MacroAssembler::*BrkpFn)(const PRegisterWithLaneSize& pd,
const PRegisterZ& pg,
const PRegisterWithLaneSize& pn,
const PRegisterWithLaneSize& pm);
template <typename Tg, typename Tn, typename Td>
static void BrkpaBrkpbHelper(Test* config,
BrkpFn macro,
BrkpFn macro_set_flags,
const Tg& pg_inputs,
const Tn& pn_inputs,
const Tn& pm_inputs,
const Td& pd_expected) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
PRegister pg = p15;
PRegister pn = p14;
PRegister pm = p13;
Initialise(&masm, pg.VnB(), pg_inputs);
Initialise(&masm, pn.VnB(), pn_inputs);
Initialise(&masm, pm.VnB(), pm_inputs);
// Initialise NZCV to an impossible value, to check that we actually write it.
__ Mov(x10, NZCVFlag);
__ Msr(NZCV, x10);
(masm.*macro_set_flags)(p0.VnB(), pg.Zeroing(), pn.VnB(), pm.VnB());
__ Mrs(x0, NZCV);
(masm.*macro)(p1.VnB(), pg.Zeroing(), pn.VnB(), pm.VnB());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(pd_expected, p0.VnB());
// Check that the flags were properly set.
StatusFlags nzcv_expected =
GetPredTestFlags(pd_expected,
pg_inputs,
core.GetSVELaneCount(kBRegSize));
ASSERT_EQUAL_64(nzcv_expected, x0);
ASSERT_EQUAL_SVE(p0.VnB(), p1.VnB());
}
}
template <typename Tg, typename Tn, typename Td>
static void BrkpaHelper(Test* config,
const Tg& pg_inputs,
const Tn& pn_inputs,
const Tn& pm_inputs,
const Td& pd_expected) {
BrkpaBrkpbHelper(config,
&MacroAssembler::Brkpa,
&MacroAssembler::Brkpas,
pg_inputs,
pn_inputs,
pm_inputs,
pd_expected);
}
template <typename Tg, typename Tn, typename Td>
static void BrkpbHelper(Test* config,
const Tg& pg_inputs,
const Tn& pn_inputs,
const Tn& pm_inputs,
const Td& pd_expected) {
BrkpaBrkpbHelper(config,
&MacroAssembler::Brkpb,
&MacroAssembler::Brkpbs,
pg_inputs,
pn_inputs,
pm_inputs,
pd_expected);
}
TEST_SVE(sve_brkpb) {
// clang-format off
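// If the last active element of `pn` is true, `Brkpb` sets the active lanes
// of `pd` before the first active true lane of `pm` and clears the rest;
// otherwise `pd` becomes all-false.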
// The last active element of `pn` is `true` in all vector length configurations.
// | boundary of 128-bit VL.
// v
int pg_1[] = {1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
int pg_2[] = {1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
int pg_3[] = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
// | highest-numbered lane lowest-numbered lane |
// v v
int pn_1[] = {1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
int pn_2[] = {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
int pn_3[] = {1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1};
int pm_1[] = {1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
int pm_2[] = {0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int pm_3[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
// | first active
// v
int exp_1_1_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
// | first active
// v
int exp_1_2_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
// | first active
// v
int exp_1_3_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
BrkpbHelper(config, pg_1, pn_1, pm_1, exp_1_1_1);
BrkpbHelper(config, pg_1, pn_2, pm_2, exp_1_2_2);
BrkpbHelper(config, pg_1, pn_3, pm_3, exp_1_3_3);
// | first active
// v
int exp_2_1_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
// | first active
// v
int exp_2_2_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
// | first active
// v
int exp_2_3_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1};
BrkpbHelper(config, pg_2, pn_1, pm_2, exp_2_1_2);
BrkpbHelper(config, pg_2, pn_2, pm_3, exp_2_2_3);
BrkpbHelper(config, pg_2, pn_3, pm_1, exp_2_3_1);
// | first active
// v
int exp_3_1_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
// | first active
// v
int exp_3_2_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
// | first active
// v
int exp_3_3_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
BrkpbHelper(config, pg_3, pn_1, pm_3, exp_3_1_3);
BrkpbHelper(config, pg_3, pn_2, pm_1, exp_3_2_1);
BrkpbHelper(config, pg_3, pn_3, pm_2, exp_3_3_2);
// The last active element of `pn` is `false` in all vector length configurations.
// | last active lane when VL > 128 bits.
// v
// | last active lane when VL == 128 bits.
// v
int pg_4[] = {0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
int exp_4_x_x[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
BrkpbHelper(config, pg_4, pn_1, pm_1, exp_4_x_x);
BrkpbHelper(config, pg_4, pn_2, pm_2, exp_4_x_x);
BrkpbHelper(config, pg_4, pn_3, pm_3, exp_4_x_x);
// clang-format on
}
TEST_SVE(sve_brkpa) {
// clang-format off
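// `Brkpa` is as `Brkpb` above, except that the first active true lane of `pm`
// is included in the result.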
// The last active element of `pn` is `true` in all vector length configurations.
// | boundary of 128-bit VL.
// v
int pg_1[] = {1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
int pg_2[] = {1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
int pg_3[] = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
// | highest-numbered lane lowest-numbered lane |
// v v
int pn_1[] = {1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
int pn_2[] = {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
int pn_3[] = {1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1};
int pm_1[] = {1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
int pm_2[] = {0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int pm_3[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
// | first active
// v
int exp_1_1_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0};
// | first active
// v
int exp_1_2_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
// | first active
// v
int exp_1_3_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0};
BrkpaHelper(config, pg_1, pn_1, pm_1, exp_1_1_1);
BrkpaHelper(config, pg_1, pn_2, pm_2, exp_1_2_2);
BrkpaHelper(config, pg_1, pn_3, pm_3, exp_1_3_3);
// | first active
// v
int exp_2_1_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
// | first active
// v
int exp_2_2_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
// | first active
// v
int exp_2_3_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1};
BrkpaHelper(config, pg_2, pn_1, pm_2, exp_2_1_2);
BrkpaHelper(config, pg_2, pn_2, pm_3, exp_2_2_3);
BrkpaHelper(config, pg_2, pn_3, pm_1, exp_2_3_1);
// | first active
// v
int exp_3_1_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1};
// | first active
// v
int exp_3_2_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1};
// | first active
// v
int exp_3_3_2[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
BrkpaHelper(config, pg_3, pn_1, pm_3, exp_3_1_3);
BrkpaHelper(config, pg_3, pn_2, pm_1, exp_3_2_1);
BrkpaHelper(config, pg_3, pn_3, pm_2, exp_3_3_2);
// The last active element of `pn` is `false` in all vector length configurations.
// | last active lane when VL > 128 bits.
// v
// | last active lane when VL == 128 bits.
// v
int pg_4[] = {0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
int exp_4_x_x[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
BrkpaHelper(config, pg_4, pn_1, pm_1, exp_4_x_x);
BrkpaHelper(config, pg_4, pn_2, pm_2, exp_4_x_x);
BrkpaHelper(config, pg_4, pn_3, pm_3, exp_4_x_x);
// clang-format on
}
TEST_SVE(sve_rbit) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t inputs[] = {0xaaaaaaaa55555555, 0xaaaa5555aa55aa55};
InsrHelper(&masm, z0.VnD(), inputs);
__ Ptrue(p1.VnB());
int pred[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1};
Initialise(&masm, p2.VnB(), pred);
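// Applying Rbit twice restores the original value; z0 is checked against
// `inputs` below.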
__ Rbit(z0.VnB(), p1.Merging(), z0.VnB());
__ Rbit(z0.VnB(), p1.Merging(), z0.VnB());
__ Rbit(z1.VnB(), p1.Merging(), z0.VnB());
__ Rbit(z2.VnH(), p1.Merging(), z0.VnH());
__ Rbit(z3.VnS(), p1.Merging(), z0.VnS());
__ Rbit(z4.VnD(), p1.Merging(), z0.VnD());
__ Dup(z5.VnB(), 0x42);
__ Rbit(z5.VnB(), p2.Merging(), z0.VnB());
__ Dup(z6.VnB(), 0x42);
__ Rbit(z6.VnS(), p2.Merging(), z0.VnS());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(inputs, z0.VnD());
uint64_t expected_z1[] = {0x55555555aaaaaaaa, 0x5555aaaa55aa55aa};
ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
uint64_t expected_z2[] = {0x55555555aaaaaaaa, 0x5555aaaaaa55aa55};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
uint64_t expected_z3[] = {0x55555555aaaaaaaa, 0xaaaa5555aa55aa55};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
uint64_t expected_z4[] = {0xaaaaaaaa55555555, 0xaa55aa55aaaa5555};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
uint64_t expected_z5[] = {0x4255425542aa42aa, 0x4255424242aa42aa};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
uint64_t expected_z6[] = {0x55555555aaaaaaaa, 0x42424242aa55aa55};
ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
}
}
TEST_SVE(sve_rev_bhw) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t inputs[] = {0xaaaaaaaa55555555, 0xaaaa5555aa55aa55};
InsrHelper(&masm, z0.VnD(), inputs);
__ Ptrue(p1.VnB());
int pred[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1};
Initialise(&masm, p2.VnB(), pred);
__ Revb(z1.VnH(), p1.Merging(), z0.VnH());
__ Revb(z2.VnS(), p1.Merging(), z0.VnS());
__ Revb(z3.VnD(), p1.Merging(), z0.VnD());
__ Revh(z4.VnS(), p1.Merging(), z0.VnS());
__ Revh(z5.VnD(), p1.Merging(), z0.VnD());
__ Revw(z6.VnD(), p1.Merging(), z0.VnD());
__ Dup(z7.VnB(), 0x42);
__ Revb(z7.VnH(), p2.Merging(), z0.VnH());
__ Dup(z8.VnB(), 0x42);
__ Revh(z8.VnS(), p2.Merging(), z0.VnS());
END();
if (CAN_RUN()) {
RUN();
uint64_t expected_z1[] = {0xaaaaaaaa55555555, 0xaaaa555555aa55aa};
ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
uint64_t expected_z2[] = {0xaaaaaaaa55555555, 0x5555aaaa55aa55aa};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
uint64_t expected_z3[] = {0x55555555aaaaaaaa, 0x55aa55aa5555aaaa};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
uint64_t expected_z4[] = {0xaaaaaaaa55555555, 0x5555aaaaaa55aa55};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
uint64_t expected_z5[] = {0x55555555aaaaaaaa, 0xaa55aa555555aaaa};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
uint64_t expected_z6[] = {0x55555555aaaaaaaa, 0xaa55aa55aaaa5555};
ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
uint64_t expected_z7[] = {0xaaaaaaaa55555555, 0xaaaa424255aa55aa};
ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
uint64_t expected_z8[] = {0xaaaaaaaa55555555, 0x42424242aa55aa55};
ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
}
}
TEST_SVE(sve_ftssel) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t in[] = {0x1111777766665555, 0xaaaabbbbccccdddd};
uint64_t q[] = {0x0001000300000002, 0x0001000200000003};
InsrHelper(&masm, z0.VnD(), in);
InsrHelper(&masm, z1.VnD(), q);
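// For each element, bit 0 of `q` selects 1.0 in place of the corresponding
// `in` element, and bit 1 flips the sign of the result.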
__ Ftssel(z2.VnH(), z0.VnH(), z1.VnH());
__ Ftssel(z3.VnS(), z0.VnS(), z1.VnS());
__ Ftssel(z4.VnD(), z0.VnD(), z1.VnD());
END();
if (CAN_RUN()) {
RUN();
uint64_t expected_z2[] = {0x3c00bc006666d555, 0x3c003bbbccccbc00};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
uint64_t expected_z3[] = {0xbf800000e6665555, 0x2aaabbbbbf800000};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
uint64_t expected_z4[] = {0x9111777766665555, 0xbff0000000000000};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
}
}
TEST_SVE(sve_fexpa) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t in0[] = {0x3ff0000000000000, 0x3ff0000000011001};
uint64_t in1[] = {0x3ff000000002200f, 0xbff000000003301f};
uint64_t in2[] = {0xbff000000004403f, 0x3ff0000000055040};
uint64_t in3[] = {0x3f800000bf800001, 0x3f80000f3f80001f};
uint64_t in4[] = {0x3f80002f3f82203f, 0xbf8000403f833041};
uint64_t in5[] = {0x3c003c01bc00bc07, 0x3c08bc0f3c1fbc20};
InsrHelper(&masm, z0.VnD(), in0);
InsrHelper(&masm, z1.VnD(), in1);
InsrHelper(&masm, z2.VnD(), in2);
InsrHelper(&masm, z3.VnD(), in3);
InsrHelper(&masm, z4.VnD(), in4);
InsrHelper(&masm, z5.VnD(), in5);
__ Fexpa(z6.VnD(), z0.VnD());
__ Fexpa(z7.VnD(), z1.VnD());
__ Fexpa(z8.VnD(), z2.VnD());
__ Fexpa(z9.VnS(), z3.VnS());
__ Fexpa(z10.VnS(), z4.VnS());
__ Fexpa(z11.VnH(), z5.VnH());
END();
if (CAN_RUN()) {
RUN();
uint64_t expected_z6[] = {0x0000000000000000, 0x44002c9a3e778061};
ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
uint64_t expected_z7[] = {0x0802d285a6e4030b, 0x4c06623882552225};
ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
uint64_t expected_z8[] = {0x100fa7c1819e90d8, 0x5410000000000000};
ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
uint64_t expected_z9[] = {0x00000000000164d2, 0x0016942d003311c4};
ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
uint64_t expected_z10[] = {0x0054f35b407d3e0c, 0x00800000608164d2};
ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
uint64_t expected_z11[] = {0x00000016000000a8, 0x00c2018903d40400};
ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
}
}
TEST_SVE(sve_rev_p) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
Initialise(&masm,
p0.VnB(),
0xabcdabcdabcdabcd,
0xabcdabcdabcdabcd,
0xabcdabcdabcdabcd,
0xabcdabcdabcdabcd);
__ Rev(p1.VnB(), p0.VnB());
__ Rev(p2.VnH(), p0.VnH());
__ Rev(p3.VnS(), p0.VnS());
__ Rev(p4.VnD(), p0.VnD());
END();
if (CAN_RUN()) {
RUN();
int p1_expected[] = {1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1};
ASSERT_EQUAL_SVE(p1_expected, p1.VnB());
int p2_expected[] = {0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0};
ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
int p3_expected[] = {1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0};
ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
int p4_expected[] = {1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1};
ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
}
}
TEST_SVE(sve_trn_p_bh) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
Initialise(&masm, p0.VnB(), 0xa5a55a5a);
__ Pfalse(p1.VnB());
__ Trn1(p2.VnB(), p0.VnB(), p0.VnB());
__ Trn2(p3.VnB(), p0.VnB(), p0.VnB());
__ Trn1(p4.VnB(), p1.VnB(), p0.VnB());
__ Trn2(p5.VnB(), p1.VnB(), p0.VnB());
__ Trn1(p6.VnB(), p0.VnB(), p1.VnB());
__ Trn2(p7.VnB(), p0.VnB(), p1.VnB());
__ Trn1(p8.VnH(), p0.VnH(), p0.VnH());
__ Trn2(p9.VnH(), p0.VnH(), p0.VnH());
__ Trn1(p10.VnH(), p1.VnH(), p0.VnH());
__ Trn2(p11.VnH(), p1.VnH(), p0.VnH());
__ Trn1(p12.VnH(), p0.VnH(), p1.VnH());
__ Trn2(p13.VnH(), p0.VnH(), p1.VnH());
END();
if (CAN_RUN()) {
RUN();
int p2_expected[] = {1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0};
int p3_expected[] = {0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1};
ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
int p4_expected[] = {1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0};
int p5_expected[] = {0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0};
ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
ASSERT_EQUAL_SVE(p5_expected, p5.VnB());
int p6_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0};
int p7_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1};
ASSERT_EQUAL_SVE(p6_expected, p6.VnB());
ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
int p8_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
int p9_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
ASSERT_EQUAL_SVE(p9_expected, p9.VnB());
int p10_expected[] = {0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0};
int p11_expected[] = {0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0};
ASSERT_EQUAL_SVE(p10_expected, p10.VnB());
ASSERT_EQUAL_SVE(p11_expected, p11.VnB());
int p12_expected[] = {0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0};
int p13_expected[] = {0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0};
ASSERT_EQUAL_SVE(p12_expected, p12.VnB());
ASSERT_EQUAL_SVE(p13_expected, p13.VnB());
}
}
TEST_SVE(sve_trn_p_sd) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
Initialise(&masm, p0.VnB(), 0x55a55aaa);
__ Pfalse(p1.VnB());
__ Trn1(p2.VnS(), p0.VnS(), p0.VnS());
__ Trn2(p3.VnS(), p0.VnS(), p0.VnS());
__ Trn1(p4.VnS(), p1.VnS(), p0.VnS());
__ Trn2(p5.VnS(), p1.VnS(), p0.VnS());
__ Trn1(p6.VnS(), p0.VnS(), p1.VnS());
__ Trn2(p7.VnS(), p0.VnS(), p1.VnS());
__ Trn1(p8.VnD(), p0.VnD(), p0.VnD());
__ Trn2(p9.VnD(), p0.VnD(), p0.VnD());
__ Trn1(p10.VnD(), p1.VnD(), p0.VnD());
__ Trn2(p11.VnD(), p1.VnD(), p0.VnD());
__ Trn1(p12.VnD(), p0.VnD(), p1.VnD());
__ Trn2(p13.VnD(), p0.VnD(), p1.VnD());
END();
if (CAN_RUN()) {
RUN();
int p2_expected[] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0};
int p3_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0};
ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
int p4_expected[] = {1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0};
int p5_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0};
ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
ASSERT_EQUAL_SVE(p5_expected, p5.VnB());
int p6_expected[] = {0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0};
int p7_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0};
ASSERT_EQUAL_SVE(p6_expected, p6.VnB());
ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
int p8_expected[] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0};
int p9_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
ASSERT_EQUAL_SVE(p9_expected, p9.VnB());
int p10_expected[] = {1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int p11_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
ASSERT_EQUAL_SVE(p10_expected, p10.VnB());
ASSERT_EQUAL_SVE(p11_expected, p11.VnB());
int p12_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0};
int p13_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0};
ASSERT_EQUAL_SVE(p12_expected, p12.VnB());
ASSERT_EQUAL_SVE(p13_expected, p13.VnB());
}
}
TEST_SVE(sve_zip_p_bh) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
Initialise(&masm,
p0.VnB(),
0x5a5a5a5a5a5a5a5a,
0x5a5a5a5a5a5a5a5a,
0x5a5a5a5a5a5a5a5a,
0x5a5a5a5a5a5a5a5a);
__ Pfalse(p1.VnB());
__ Zip1(p2.VnB(), p0.VnB(), p0.VnB());
__ Zip2(p3.VnB(), p0.VnB(), p0.VnB());
__ Zip1(p4.VnB(), p1.VnB(), p0.VnB());
__ Zip2(p5.VnB(), p1.VnB(), p0.VnB());
__ Zip1(p6.VnB(), p0.VnB(), p1.VnB());
__ Zip2(p7.VnB(), p0.VnB(), p1.VnB());
__ Zip1(p8.VnH(), p0.VnH(), p0.VnH());
__ Zip2(p9.VnH(), p0.VnH(), p0.VnH());
__ Zip1(p10.VnH(), p1.VnH(), p0.VnH());
__ Zip2(p11.VnH(), p1.VnH(), p0.VnH());
__ Zip1(p12.VnH(), p0.VnH(), p1.VnH());
__ Zip2(p13.VnH(), p0.VnH(), p1.VnH());
END();
if (CAN_RUN()) {
RUN();
int p2_expected[] = {0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0};
int p3_expected[] = {0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0};
ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
int p4_expected[] = {0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0};
int p5_expected[] = {0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0};
ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
ASSERT_EQUAL_SVE(p5_expected, p5.VnB());
int p6_expected[] = {0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0};
int p7_expected[] = {0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0};
ASSERT_EQUAL_SVE(p6_expected, p6.VnB());
ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
int p8_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0};
int p9_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0};
ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
ASSERT_EQUAL_SVE(p9_expected, p9.VnB());
int p10_expected[] = {0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
int p11_expected[] = {0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
ASSERT_EQUAL_SVE(p10_expected, p10.VnB());
ASSERT_EQUAL_SVE(p11_expected, p11.VnB());
int p12_expected[] = {0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0};
int p13_expected[] = {0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0};
ASSERT_EQUAL_SVE(p12_expected, p12.VnB());
ASSERT_EQUAL_SVE(p13_expected, p13.VnB());
}
}
TEST_SVE(sve_zip_p_sd) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
Initialise(&masm,
p0.VnB(),
0x5a5a5a5a5a5a5a5a,
0x5a5a5a5a5a5a5a5a,
0x5a5a5a5a5a5a5a5a,
0x5a5a5a5a5a5a5a5a);
__ Pfalse(p1.VnB());
__ Zip1(p2.VnS(), p0.VnS(), p0.VnS());
__ Zip2(p3.VnS(), p0.VnS(), p0.VnS());
__ Zip1(p4.VnS(), p1.VnS(), p0.VnS());
__ Zip2(p5.VnS(), p1.VnS(), p0.VnS());
__ Zip1(p6.VnS(), p0.VnS(), p1.VnS());
__ Zip2(p7.VnS(), p0.VnS(), p1.VnS());
__ Zip1(p8.VnD(), p0.VnD(), p0.VnD());
__ Zip2(p9.VnD(), p0.VnD(), p0.VnD());
__ Zip1(p10.VnD(), p1.VnD(), p0.VnD());
__ Zip2(p11.VnD(), p1.VnD(), p0.VnD());
__ Zip1(p12.VnD(), p0.VnD(), p1.VnD());
__ Zip2(p13.VnD(), p0.VnD(), p1.VnD());
END();
if (CAN_RUN()) {
RUN();
int p2_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0};
int p3_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0};
ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
int p4_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0};
int p5_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0};
ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
ASSERT_EQUAL_SVE(p5_expected, p5.VnB());
int p6_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0};
int p7_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0};
ASSERT_EQUAL_SVE(p6_expected, p6.VnB());
ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
int p8_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
int p9_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
ASSERT_EQUAL_SVE(p9_expected, p9.VnB());
int p10_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int p11_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
ASSERT_EQUAL_SVE(p10_expected, p10.VnB());
ASSERT_EQUAL_SVE(p11_expected, p11.VnB());
int p12_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0};
int p13_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0};
ASSERT_EQUAL_SVE(p12_expected, p12.VnB());
ASSERT_EQUAL_SVE(p13_expected, p13.VnB());
}
}
TEST_SVE(sve_uzp_p) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
Initialise(&masm,
p0.VnB(),
0xf0f0ff00ffff0000,
0x4242424242424242,
0x5a5a5a5a5a5a5a5a,
0x0123456789abcdef);
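// Interleave p0 with its reverse using Zip1/Zip2, then check that Uzp1/Uzp2
// recover the original operands at each lane size.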
__ Rev(p1.VnB(), p0.VnB());
__ Zip1(p2.VnB(), p0.VnB(), p1.VnB());
__ Zip2(p3.VnB(), p0.VnB(), p1.VnB());
__ Uzp1(p4.VnB(), p2.VnB(), p3.VnB());
__ Uzp2(p5.VnB(), p2.VnB(), p3.VnB());
__ Zip1(p2.VnH(), p0.VnH(), p1.VnH());
__ Zip2(p3.VnH(), p0.VnH(), p1.VnH());
__ Uzp1(p6.VnH(), p2.VnH(), p3.VnH());
__ Uzp2(p7.VnH(), p2.VnH(), p3.VnH());
__ Zip1(p2.VnS(), p0.VnS(), p1.VnS());
__ Zip2(p3.VnS(), p0.VnS(), p1.VnS());
__ Uzp1(p8.VnS(), p2.VnS(), p3.VnS());
__ Uzp2(p9.VnS(), p2.VnS(), p3.VnS());
__ Zip1(p2.VnD(), p0.VnD(), p1.VnD());
__ Zip2(p3.VnD(), p0.VnD(), p1.VnD());
__ Uzp1(p10.VnD(), p2.VnD(), p3.VnD());
__ Uzp2(p11.VnD(), p2.VnD(), p3.VnD());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(p0, p4);
ASSERT_EQUAL_SVE(p1, p5);
ASSERT_EQUAL_SVE(p0, p6);
ASSERT_EQUAL_SVE(p1, p7);
ASSERT_EQUAL_SVE(p0, p8);
ASSERT_EQUAL_SVE(p1, p9);
ASSERT_EQUAL_SVE(p0, p10);
ASSERT_EQUAL_SVE(p1, p11);
}
}
TEST_SVE(sve_punpk) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
auto get_64_bits_at = [](int byte_index) -> uint64_t {
// Each 8-bit chunk has the value 0x50 + the byte index of the chunk.
return 0x5756555453525150 + (0x0101010101010101 * byte_index);
};
Initialise(&masm,
p0.VnB(),
get_64_bits_at(24),
get_64_bits_at(16),
get_64_bits_at(8),
get_64_bits_at(0));
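// Punpklo/Punpkhi widen the low/high half of a byte-granule predicate to
// halfword granules, so each source bit lands in an even-numbered B lane with
// a zero in the odd-numbered lane above it.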
__ Punpklo(p1.VnH(), p0.VnB());
__ Punpkhi(p2.VnH(), p0.VnB());
END();
if (CAN_RUN()) {
RUN();
int pl = config->sve_vl_in_bits() / kZRegBitsPerPRegBit;
// For simplicity, just test the bottom 64 H-sized lanes.
uint64_t p1_h_bits = get_64_bits_at(0);
uint64_t p2_h_bits = get_64_bits_at(pl / (2 * 8));
int p1_expected[64];
int p2_expected[64];
for (size_t i = 0; i < 64; i++) {
p1_expected[63 - i] = (p1_h_bits >> i) & 1;
p2_expected[63 - i] = (p2_h_bits >> i) & 1;
}
// Testing `VnH` ensures that odd-numbered B lanes are zero.
ASSERT_EQUAL_SVE(p1_expected, p1.VnH());
ASSERT_EQUAL_SVE(p2_expected, p2.VnH());
}
}
typedef void (MacroAssembler::*BrkFn)(const PRegisterWithLaneSize& pd,
const PRegister& pg,
const PRegisterWithLaneSize& pn);
typedef void (MacroAssembler::*BrksFn)(const PRegisterWithLaneSize& pd,
const PRegisterZ& pg,
const PRegisterWithLaneSize& pn);
template <typename T, size_t N>
static void BrkaBrkbHelper(Test* config,
BrkFn macro,
BrksFn macro_set_flags,
const T (&pd_inputs)[N],
const T (&pg_inputs)[N],
const T (&pn_inputs)[N],
const T (&pd_z_expected)[N]) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
PRegister pg = p10;
PRegister pn = p9;
PRegister pd_z = p0;
PRegister pd_z_s = p1;
PRegister pd_m = p2;
Initialise(&masm, pg.VnB(), pg_inputs);
Initialise(&masm, pn.VnB(), pn_inputs);
Initialise(&masm, pd_m.VnB(), pd_inputs);
// Initialise NZCV to an impossible value, to check that we actually write it.
__ Mov(x10, NZCVFlag);
__ Msr(NZCV, x10);
(masm.*macro)(pd_z.VnB(), pg.Zeroing(), pn.VnB());
(masm.*macro_set_flags)(pd_z_s.VnB(), pg.Zeroing(), pn.VnB());
__ Mrs(x0, NZCV);
(masm.*macro)(pd_m.VnB(), pg.Merging(), pn.VnB());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(pd_z_expected, pd_z.VnB());
// Check that the flags were properly set.
StatusFlags nzcv_expected =
GetPredTestFlags(pd_z_expected,
pg_inputs,
core.GetSVELaneCount(kBRegSize));
ASSERT_EQUAL_64(nzcv_expected, x0);
ASSERT_EQUAL_SVE(pd_z.VnB(), pd_z_s.VnB());
T pd_m_expected[N];
// Set expected `pd` result on merging predication.
for (size_t i = 0; i < N; i++) {
pd_m_expected[i] = pg_inputs[i] ? pd_z_expected[i] : pd_inputs[i];
}
ASSERT_EQUAL_SVE(pd_m_expected, pd_m.VnB());
}
}
template <typename T>
static void BrkaHelper(Test* config,
const T& pd_inputs,
const T& pg_inputs,
const T& pn_inputs,
const T& pd_expected) {
BrkaBrkbHelper(config,
&MacroAssembler::Brka,
&MacroAssembler::Brkas,
pd_inputs,
pg_inputs,
pn_inputs,
pd_expected);
}
TEST_SVE(sve_brka) {
// clang-format off
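// `Brka` sets the active lanes of `pd` up to and including the first active
// true lane of `pn`, and `Brkb` up to but not including it. With zeroing
// predication the remaining lanes become false; with merging they keep their
// previous values.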
// | boundary of 128-bit VL.
// v
int pd[] = {1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
// | highest-numbered lane lowest-numbered lane |
// v v
int pg_1[] = {1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
int pg_2[] = {1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
int pn_1[] = {1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
int pn_2[] = {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int pn_3[] = {1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1};
// | first break
// v
int exp_1_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0};
// | first break
// v
int exp_1_2[] = {0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
// | first break
// v
int exp_1_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
BrkaHelper(config, pd, pg_1, pn_1, exp_1_1);
BrkaHelper(config, pd, pg_1, pn_2, exp_1_2);
BrkaHelper(config, pd, pg_1, pn_3, exp_1_3);
// | first break
// v
int exp_2_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1};
// | first break
// v
int exp_2_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
// | first break
// v
int exp_2_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
BrkaHelper(config, pd, pg_2, pn_1, exp_2_1);
BrkaHelper(config, pd, pg_2, pn_2, exp_2_2);
BrkaHelper(config, pd, pg_2, pn_3, exp_2_3);
// An all-inactive governing predicate with zeroing predication sets the
// destination predicate to all-false.
int pg_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp_3_x[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
BrkaHelper(config, pd, pg_3, pn_1, exp_3_x);
BrkaHelper(config, pd, pg_3, pn_2, exp_3_x);
BrkaHelper(config, pd, pg_3, pn_3, exp_3_x);
// clang-format on
}
template <typename T>
static void BrkbHelper(Test* config,
const T& pd_inputs,
const T& pg_inputs,
const T& pn_inputs,
const T& pd_expected) {
BrkaBrkbHelper(config,
&MacroAssembler::Brkb,
&MacroAssembler::Brkbs,
pd_inputs,
pg_inputs,
pn_inputs,
pd_expected);
}
TEST_SVE(sve_brkb) {
// clang-format off
// | boundary of 128-bit VL.
// v
int pd[] = {1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
// | highest-numbered lane lowest-numbered lane |
// v v
int pg_1[] = {1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
int pg_2[] = {1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
int pn_1[] = {1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
int pn_2[] = {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int pn_3[] = {1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1};
// | first break
// v
int exp_1_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
// | first break
// v
int exp_1_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
// | first break
// v
int exp_1_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0};
BrkbHelper(config, pd, pg_1, pn_1, exp_1_1);
BrkbHelper(config, pd, pg_1, pn_2, exp_1_2);
BrkbHelper(config, pd, pg_1, pn_3, exp_1_3);
// | first break
// v
int exp_2_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1};
// | first break
// v
int exp_2_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
// | first break
// v
int exp_2_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
BrkbHelper(config, pd, pg_2, pn_1, exp_2_1);
BrkbHelper(config, pd, pg_2, pn_2, exp_2_2);
BrkbHelper(config, pd, pg_2, pn_3, exp_2_3);
// An all-inactive governing predicate with zeroing predication sets the
// destination predicate to all-false.
int pg_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int exp_3_x[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
BrkbHelper(config, pd, pg_3, pn_1, exp_3_x);
BrkbHelper(config, pd, pg_3, pn_2, exp_3_x);
BrkbHelper(config, pd, pg_3, pn_3, exp_3_x);
// clang-format on
}
typedef void (MacroAssembler::*BrknFn)(const PRegisterWithLaneSize& pd,
const PRegisterZ& pg,
const PRegisterWithLaneSize& pn,
const PRegisterWithLaneSize& pm);
typedef void (MacroAssembler::*BrknsFn)(const PRegisterWithLaneSize& pd,
const PRegisterZ& pg,
const PRegisterWithLaneSize& pn,
const PRegisterWithLaneSize& pm);
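// Brkn propagates a break to the next partition: if the last active element
// of `pn` is true, `pd` is a copy of `pm`; otherwise `pd` becomes all-false.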
enum BrknDstPredicateState { kAllFalse, kUnchanged };
template <typename T, size_t N>
static void BrknHelper(Test* config,
const T (&pd_inputs)[N],
const T (&pg_inputs)[N],
const T (&pn_inputs)[N],
const T (&pm_inputs)[N],
BrknDstPredicateState expected_pd_state) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
PRegister pg = p10;
PRegister pn = p9;
PRegister pm = p8;
PRegister pdm = p0;
PRegister pd = p1;
PRegister pd_s = p2;
Initialise(&masm, pg.VnB(), pg_inputs);
Initialise(&masm, pn.VnB(), pn_inputs);
Initialise(&masm, pm.VnB(), pm_inputs);
Initialise(&masm, pdm.VnB(), pm_inputs);
Initialise(&masm, pd.VnB(), pd_inputs);
Initialise(&masm, pd_s.VnB(), pd_inputs);
// Initialise NZCV to an impossible value, to check that we actually write it.
__ Mov(x10, NZCVFlag);
__ Msr(NZCV, x10);
__ Brkn(pdm.VnB(), pg.Zeroing(), pn.VnB(), pdm.VnB());
// The non-aliased form: `pd` does not alias `pm`.
__ Brkn(pd.VnB(), pg.Zeroing(), pn.VnB(), pm.VnB());
__ Brkns(pd_s.VnB(), pg.Zeroing(), pn.VnB(), pm.VnB());
__ Mrs(x0, NZCV);
END();
if (CAN_RUN()) {
RUN();
T all_false[N] = {0};
if (expected_pd_state == kAllFalse) {
ASSERT_EQUAL_SVE(all_false, pd.VnB());
} else {
ASSERT_EQUAL_SVE(pm_inputs, pd.VnB());
}
ASSERT_EQUAL_SVE(pm_inputs, pm.VnB());
T all_true[N];
for (size_t i = 0; i < ArrayLength(all_true); i++) {
all_true[i] = 1;
}
// Check that the flags were properly set.
StatusFlags nzcv_expected =
GetPredTestFlags((expected_pd_state == kAllFalse) ? all_false
: pm_inputs,
all_true,
core.GetSVELaneCount(kBRegSize));
ASSERT_EQUAL_64(nzcv_expected, x0);
ASSERT_EQUAL_SVE(pd.VnB(), pdm.VnB());
ASSERT_EQUAL_SVE(pd.VnB(), pd_s.VnB());
}
}
TEST_SVE(sve_brkn) {
int pd[] = {1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
int pm[] = {0, 1, 1, 1, 1, 0, 0, 1, 0, 1};
int pg_1[] = {1, 1, 0, 0, 1, 0, 1, 1, 0, 0};
int pg_2[] = {0, 0, 0, 1, 1, 1, 0, 0, 1, 1};
int pg_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int pn_1[] = {1, 0, 0, 0, 0, 1, 1, 0, 0, 0};
int pn_2[] = {0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
int pn_3[] = {0, 0, 0, 0, 1, 1, 0, 0, 1, 1};
BrknHelper(config, pd, pg_1, pn_1, pm, kUnchanged);
BrknHelper(config, pd, pg_1, pn_2, pm, kAllFalse);
BrknHelper(config, pd, pg_1, pn_3, pm, kAllFalse);
BrknHelper(config, pd, pg_2, pn_1, pm, kAllFalse);
BrknHelper(config, pd, pg_2, pn_2, pm, kUnchanged);
BrknHelper(config, pd, pg_2, pn_3, pm, kAllFalse);
BrknHelper(config, pd, pg_3, pn_1, pm, kAllFalse);
BrknHelper(config, pd, pg_3, pn_2, pm, kAllFalse);
BrknHelper(config, pd, pg_3, pn_3, pm, kAllFalse);
}
TEST_SVE(sve_trn) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
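// Trn1 interleaves the even-numbered lanes of the two sources; Trn2 the
// odd-numbered lanes.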
uint64_t in0[] = {0xffeeddccbbaa9988, 0x7766554433221100};
uint64_t in1[] = {0xaa55aa55aa55aa55, 0x55aa55aa55aa55aa};
InsrHelper(&masm, z0.VnD(), in0);
InsrHelper(&masm, z1.VnD(), in1);
__ Trn1(z2.VnB(), z0.VnB(), z1.VnB());
__ Trn2(z3.VnB(), z0.VnB(), z1.VnB());
__ Trn1(z4.VnH(), z0.VnH(), z1.VnH());
__ Trn2(z5.VnH(), z0.VnH(), z1.VnH());
__ Trn1(z6.VnS(), z0.VnS(), z1.VnS());
__ Trn2(z7.VnS(), z0.VnS(), z1.VnS());
__ Trn1(z8.VnD(), z0.VnD(), z1.VnD());
__ Trn2(z9.VnD(), z0.VnD(), z1.VnD());
END();
if (CAN_RUN()) {
RUN();
uint64_t expected_z2[] = {0x55ee55cc55aa5588, 0xaa66aa44aa22aa00};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
uint64_t expected_z3[] = {0xaaffaaddaabbaa99, 0x5577555555335511};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
uint64_t expected_z4[] = {0xaa55ddccaa559988, 0x55aa554455aa1100};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
uint64_t expected_z5[] = {0xaa55ffeeaa55bbaa, 0x55aa776655aa3322};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
uint64_t expected_z6[] = {0xaa55aa55bbaa9988, 0x55aa55aa33221100};
ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
uint64_t expected_z7[] = {0xaa55aa55ffeeddcc, 0x55aa55aa77665544};
ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
uint64_t expected_z8[] = {0x55aa55aa55aa55aa, 0x7766554433221100};
ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
uint64_t expected_z9[] = {0xaa55aa55aa55aa55, 0xffeeddccbbaa9988};
ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
}
}
TEST_SVE(sve_zip_uzp) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
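// Zip1/Zip2 interleave the low/high halves of the two sources; Uzp1/Uzp2
// extract the even/odd lanes, undoing the interleave.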
__ Dup(z0.VnD(), 0xffeeddccbbaa9988);
__ Insr(z0.VnD(), 0x7766554433221100);
__ Dup(z1.VnD(), 0xaa55aa55aa55aa55);
__ Insr(z1.VnD(), 0x55aa55aa55aa55aa);
__ Zip1(z2.VnB(), z0.VnB(), z1.VnB());
__ Zip2(z3.VnB(), z0.VnB(), z1.VnB());
__ Zip1(z4.VnH(), z0.VnH(), z1.VnH());
__ Zip2(z5.VnH(), z0.VnH(), z1.VnH());
__ Zip1(z6.VnS(), z0.VnS(), z1.VnS());
__ Zip2(z7.VnS(), z0.VnS(), z1.VnS());
__ Zip1(z8.VnD(), z0.VnD(), z1.VnD());
__ Zip2(z9.VnD(), z0.VnD(), z1.VnD());
__ Uzp1(z10.VnB(), z2.VnB(), z3.VnB());
__ Uzp2(z11.VnB(), z2.VnB(), z3.VnB());
__ Uzp1(z12.VnH(), z4.VnH(), z5.VnH());
__ Uzp2(z13.VnH(), z4.VnH(), z5.VnH());
__ Uzp1(z14.VnS(), z6.VnS(), z7.VnS());
__ Uzp2(z15.VnS(), z6.VnS(), z7.VnS());
__ Uzp1(z16.VnD(), z8.VnD(), z9.VnD());
__ Uzp2(z17.VnD(), z8.VnD(), z9.VnD());
END();
if (CAN_RUN()) {
RUN();
uint64_t expected_z2[] = {0x5577aa665555aa44, 0x5533aa225511aa00};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
uint64_t expected_z3[] = {0xaaff55eeaadd55cc, 0xaabb55aaaa995588};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
uint64_t expected_z4[] = {0x55aa776655aa5544, 0x55aa332255aa1100};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
uint64_t expected_z5[] = {0xaa55ffeeaa55ddcc, 0xaa55bbaaaa559988};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
uint64_t expected_z6[] = {0x55aa55aa77665544, 0x55aa55aa33221100};
ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
uint64_t expected_z7[] = {0xaa55aa55ffeeddcc, 0xaa55aa55bbaa9988};
ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
uint64_t expected_z8[] = {0x55aa55aa55aa55aa, 0x7766554433221100};
ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
uint64_t expected_z9[] = {0xaa55aa55aa55aa55, 0xffeeddccbbaa9988};
ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
// Check uzp is the opposite of zip.
ASSERT_EQUAL_SVE(z0.VnD(), z10.VnD());
ASSERT_EQUAL_SVE(z1.VnD(), z11.VnD());
ASSERT_EQUAL_SVE(z0.VnD(), z12.VnD());
ASSERT_EQUAL_SVE(z1.VnD(), z13.VnD());
ASSERT_EQUAL_SVE(z0.VnD(), z14.VnD());
ASSERT_EQUAL_SVE(z1.VnD(), z15.VnD());
ASSERT_EQUAL_SVE(z0.VnD(), z16.VnD());
ASSERT_EQUAL_SVE(z1.VnD(), z17.VnD());
}
}
TEST_SVE(sve_fcadd) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
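// Fcadd adds the second operand rotated by 90 or 270 degrees in the complex
// plane, where adjacent lanes hold the (real, imaginary) parts of each
// complex number.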
__ Dup(z30.VnS(), 0);
__ Ptrue(p0.VnB());
__ Pfalse(p1.VnB());
__ Zip1(p2.VnH(), p0.VnH(), p1.VnH()); // Real elements.
__ Zip1(p3.VnH(), p1.VnH(), p0.VnH()); // Imaginary elements.
__ Fdup(z0.VnH(), 10.0); // 10i + 10
__ Fdup(z1.VnH(), 5.0); // 5i + 5
__ Index(z7.VnH(), 1, 1);
__ Scvtf(z7.VnH(), p0.Merging(), z7.VnH()); // Ai + B
__ Sel(z2.VnH(), p3, z1.VnH(), z30.VnH()); // 5i + 0
__ Sel(z3.VnH(), p2, z1.VnH(), z30.VnH()); // 0i + 5
__ Sel(z7.VnH(), p3, z7.VnH(), z0.VnH()); // Ai + 10
__ Mov(z8, z7);
__ Ext(z8.VnB(), z8.VnB(), z8.VnB(), 2);
__ Sel(z8.VnH(), p2, z8.VnH(), z30.VnH()); // 0i + A
// (10i + 10) + rotate(5i + 0, 90)
// = (10i + 10) + (0i - 5)
// = 10i + 5
__ Fcadd(z4.VnH(), p0.Merging(), z0.VnH(), z2.VnH(), 90);
// (10i + 5) + rotate(0i + 5, 270)
// = (10i + 5) + (-5i + 0)
// = 5i + 5
__ Fcadd(z4.VnH(), p0.Merging(), z4.VnH(), z3.VnH(), 270);
// The same calculation, but selecting real/imaginary using predication.
__ Mov(z5, z0);
__ Fcadd(z5.VnH(), p2.Merging(), z5.VnH(), z1.VnH(), 90);
__ Fcadd(z5.VnH(), p3.Merging(), z5.VnH(), z1.VnH(), 270);
// Reference calculation: (10i + 10) - (5i + 5)
__ Fsub(z6.VnH(), z0.VnH(), z1.VnH());
// Calculation using varying imaginary values.
// (Ai + 10) + rotate(5i + 0, 90)
// = (Ai + 10) + (0i - 5)
// = Ai + 5
__ Fcadd(z7.VnH(), p0.Merging(), z7.VnH(), z2.VnH(), 90);
// (Ai + 5) + rotate(0i + A, 270)
// = (Ai + 5) + (-Ai + 0)
// = 5
__ Fcadd(z7.VnH(), p0.Merging(), z7.VnH(), z8.VnH(), 270);
// Repeated, but for wider elements.
__ Zip1(p2.VnS(), p0.VnS(), p1.VnS());
__ Zip1(p3.VnS(), p1.VnS(), p0.VnS());
__ Fdup(z0.VnS(), 42.0);
__ Fdup(z1.VnS(), 21.0);
__ Index(z11.VnS(), 1, 1);
__ Scvtf(z11.VnS(), p0.Merging(), z11.VnS());
__ Sel(z2.VnS(), p3, z1.VnS(), z30.VnS());
__ Sel(z29.VnS(), p2, z1.VnS(), z30.VnS());
__ Sel(z11.VnS(), p3, z11.VnS(), z0.VnS());
__ Mov(z12, z11);
__ Ext(z12.VnB(), z12.VnB(), z12.VnB(), 4);
__ Sel(z12.VnS(), p2, z12.VnS(), z30.VnS());
__ Fcadd(z8.VnS(), p0.Merging(), z0.VnS(), z2.VnS(), 90);
__ Fcadd(z8.VnS(), p0.Merging(), z8.VnS(), z29.VnS(), 270);
__ Mov(z9, z0);
__ Fcadd(z9.VnS(), p2.Merging(), z9.VnS(), z1.VnS(), 90);
__ Fcadd(z9.VnS(), p3.Merging(), z9.VnS(), z1.VnS(), 270);
__ Fsub(z10.VnS(), z0.VnS(), z1.VnS());
__ Fcadd(z11.VnS(), p0.Merging(), z11.VnS(), z2.VnS(), 90);
__ Fcadd(z11.VnS(), p0.Merging(), z11.VnS(), z12.VnS(), 270);
__ Zip1(p2.VnD(), p0.VnD(), p1.VnD());
__ Zip1(p3.VnD(), p1.VnD(), p0.VnD());
__ Fdup(z0.VnD(), -42.0);
__ Fdup(z1.VnD(), -21.0);
__ Index(z15.VnD(), 1, 1);
__ Scvtf(z15.VnD(), p0.Merging(), z15.VnD());
__ Sel(z2.VnD(), p3, z1.VnD(), z30.VnD());
__ Sel(z28.VnD(), p2, z1.VnD(), z30.VnD());
__ Sel(z15.VnD(), p3, z15.VnD(), z0.VnD());
__ Mov(z16, z15);
__ Ext(z16.VnB(), z16.VnB(), z16.VnB(), 8);
__ Sel(z16.VnD(), p2, z16.VnD(), z30.VnD());
__ Fcadd(z12.VnD(), p0.Merging(), z0.VnD(), z2.VnD(), 90);
__ Fcadd(z12.VnD(), p0.Merging(), z12.VnD(), z28.VnD(), 270);
__ Mov(z13, z0);
__ Fcadd(z13.VnD(), p2.Merging(), z13.VnD(), z1.VnD(), 90);
__ Fcadd(z13.VnD(), p3.Merging(), z13.VnD(), z1.VnD(), 270);
__ Fsub(z14.VnD(), z0.VnD(), z1.VnD());
__ Fcadd(z15.VnD(), p0.Merging(), z15.VnD(), z2.VnD(), 90);
__ Fcadd(z15.VnD(), p0.Merging(), z15.VnD(), z16.VnD(), 270);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(z6.VnH(), z4.VnH());
ASSERT_EQUAL_SVE(z6.VnH(), z5.VnH());
ASSERT_EQUAL_SVE(z3.VnH(), z7.VnH());
ASSERT_EQUAL_SVE(z10.VnS(), z8.VnS());
ASSERT_EQUAL_SVE(z10.VnS(), z9.VnS());
ASSERT_EQUAL_SVE(z29.VnS(), z11.VnS());
ASSERT_EQUAL_SVE(z14.VnD(), z12.VnD());
ASSERT_EQUAL_SVE(z14.VnD(), z13.VnD());
ASSERT_EQUAL_SVE(z28.VnS(), z15.VnS());
}
}
TEST_SVE(sve_fcmla_index) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
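// The indexed form of Fcmla multiplies every complex pair by a single
// complex pair selected by index from each 128-bit segment of the second
// source vector.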
__ Ptrue(p0.VnB());
__ Fdup(z0.VnH(), 10.0);
__ Fdup(z2.VnH(), 2.0);
__ Zip1(z0.VnH(), z0.VnH(), z2.VnH());
// Duplicate complex numbers across z2 segments. First segment has 1i+0,
// second has 3i+2, etc.
__ Index(z1.VnH(), 0, 1);
__ Scvtf(z1.VnH(), p0.Merging(), z1.VnH());
__ Zip1(z2.VnS(), z1.VnS(), z1.VnS());
__ Zip1(z2.VnS(), z2.VnS(), z2.VnS());
// Derive a vector from z2 where only the third element in each segment
// contains a complex number, with other elements zero.
__ Index(z3.VnS(), 0, 1);
__ And(z3.VnS(), z3.VnS(), 3);
__ Cmpeq(p2.VnS(), p0.Zeroing(), z3.VnS(), 2);
__ Dup(z3.VnB(), 0);
__ Sel(z3.VnS(), p2, z2.VnS(), z3.VnS());
// Use indexed complex multiply on this vector, indexing the third element.
__ Dup(z4.VnH(), 0);
__ Fcmla(z4.VnH(), z0.VnH(), z3.VnH(), 2, 0);
__ Fcmla(z4.VnH(), z0.VnH(), z3.VnH(), 2, 90);
// Move the complex number to a different position within each segment, then
// repeat the calculation, negated and with a different index.
__ Ext(z3.VnH(), z3.VnH(), z3.VnH(), 4);
__ Dup(z5.VnH(), 0);
__ Fcmla(z5.VnH(), z0.VnH(), z3.VnH(), 1, 180);
__ Fcmla(z5.VnH(), z0.VnH(), z3.VnH(), 1, 270);
__ Fneg(z5.VnH(), p0.Merging(), z5.VnH());
// Create a reference result from a vector complex multiply.
__ Dup(z6.VnH(), 0);
__ Fcmla(z6.VnH(), p0.Merging(), z6.VnH(), z0.VnH(), z2.VnH(), 0);
__ Fcmla(z6.VnH(), p0.Merging(), z6.VnH(), z0.VnH(), z2.VnH(), 90);
// Repeated, but for wider elements.
__ Fdup(z0.VnS(), 42.0);
__ Fdup(z2.VnS(), 24.0);
__ Zip1(z0.VnS(), z0.VnS(), z2.VnS());
__ Index(z1.VnS(), -42, 13);
__ Scvtf(z1.VnS(), p0.Merging(), z1.VnS());
__ Zip1(z2.VnD(), z1.VnD(), z1.VnD());
__ Zip1(z2.VnD(), z2.VnD(), z2.VnD());
__ Index(z3.VnD(), 0, 1);
__ And(z3.VnD(), z3.VnD(), 1);
__ Cmpeq(p2.VnD(), p0.Zeroing(), z3.VnD(), 1);
__ Dup(z3.VnB(), 0);
__ Sel(z3.VnD(), p2, z2.VnD(), z3.VnD());
__ Dup(z7.VnS(), 0);
__ Fcmla(z7.VnS(), z0.VnS(), z3.VnS(), 1, 0);
__ Fcmla(z7.VnS(), z0.VnS(), z3.VnS(), 1, 90);
__ Ext(z3.VnB(), z3.VnB(), z3.VnB(), 8);
__ Dup(z8.VnS(), 0);
__ Fcmla(z8.VnS(), z0.VnS(), z3.VnS(), 0, 180);
__ Fcmla(z8.VnS(), z0.VnS(), z3.VnS(), 0, 270);
__ Fneg(z8.VnS(), p0.Merging(), z8.VnS());
__ Dup(z9.VnS(), 0);
__ Fcmla(z9.VnS(), p0.Merging(), z9.VnS(), z0.VnS(), z2.VnS(), 0);
__ Fcmla(z9.VnS(), p0.Merging(), z9.VnS(), z0.VnS(), z2.VnS(), 90);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(z6.VnH(), z4.VnH());
ASSERT_EQUAL_SVE(z6.VnH(), z5.VnH());
ASSERT_EQUAL_SVE(z9.VnS(), z7.VnS());
ASSERT_EQUAL_SVE(z9.VnS(), z8.VnS());
}
}
TEST_SVE(sve_fcmla) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
__ Ptrue(p0.VnB());
__ Pfalse(p1.VnB());
__ Zip1(p2.VnH(), p0.VnH(), p1.VnH()); // Real elements.
__ Zip1(p3.VnH(), p1.VnH(), p0.VnH()); // Imaginary elements.
__ Fdup(z0.VnH(), 10.0);
__ Fdup(z2.VnH(), 2.0);
// Create pairs of complex numbers, Ai + A. A is chosen to be non-zero, as
// the later Fneg would otherwise turn 0.0 into -0.0, failing the bitwise
// result comparison.
__ Index(z1.VnH(), -4, 3);
__ Zip1(z1.VnH(), z1.VnH(), z1.VnH());
__ Zip1(z1.VnH(), z1.VnH(), z1.VnH());
__ Scvtf(z1.VnH(), p0.Merging(), z1.VnH());
__ Sel(z3.VnH(), p2, z0.VnH(), z1.VnH()); // Ai + 10
__ Sel(z4.VnH(), p2, z1.VnH(), z2.VnH()); // 2i + A
__ Zip1(p2.VnS(), p0.VnS(), p1.VnS()); // Even complex numbers.
__ Zip1(p3.VnS(), p1.VnS(), p0.VnS()); // Odd complex numbers.
// Calculate (Ai + 10) * (2i + A) = (20 + A^2)i + 8A, using predication to
// select only the complex numbers in odd-numbered element pairs. This leaves
// results in elements 2/3, 6/7, etc. with zero in elements 0/1, 4/5, etc.
// ... 7 6 5 4 3 2 1 0 <-- element
// ... | 20+A^2 | 8A | 0 | 0 | 20+A^2 | 8A | 0 | 0 | <-- value
__ Dup(z5.VnH(), 0);
__ Fcmla(z5.VnH(), p3.Merging(), z5.VnH(), z4.VnH(), z3.VnH(), 0);
__ Fcmla(z5.VnH(), p3.Merging(), z5.VnH(), z4.VnH(), z3.VnH(), 90);
// Move the odd results to the even result positions.
// ... 7 6 5 4 3 2 1 0 <-- element
// ... | 0 | 0 | 20+A^2 | 8A | 0 | 0 | 20+A^2 | 8A | <-- value
__ Ext(z5.VnB(), z5.VnB(), z5.VnB(), 4);
// Calculate -(Ai + 10) * (2i + A) = -(20 + A^2)i - 8A for the even complex
// numbers.
// ... 7 6 5 4 3 2 1 0 <-- element
// ... | 0 | 0 | -20-A^2 | -8A | 0 | 0 | -20-A^2 | -8A | <-- value
__ Dup(z6.VnH(), 0);
__ Fcmla(z6.VnH(), p2.Merging(), z6.VnH(), z4.VnH(), z3.VnH(), 180);
__ Fcmla(z6.VnH(), p2.Merging(), z6.VnH(), z4.VnH(), z3.VnH(), 270);
// Negate the even results. The results in z6 should now match the results
// computed earlier in z5.
// ... 7 6 5 4 3 2 1 0 <-- element
// ... | 0 | 0 | 20+A^2 | 8A | 0 | 0 | 20+A^2 | 8A | <-- value
__ Fneg(z6.VnH(), p2.Merging(), z6.VnH());
// Similarly, but for wider elements.
__ Zip1(p2.VnS(), p0.VnS(), p1.VnS());
__ Zip1(p3.VnS(), p1.VnS(), p0.VnS());
__ Index(z1.VnS(), -4, 3);
__ Zip1(z1.VnS(), z1.VnS(), z1.VnS());
__ Zip1(z1.VnS(), z1.VnS(), z1.VnS());
__ Scvtf(z1.VnS(), p0.Merging(), z1.VnS());
__ Fdup(z0.VnS(), 20.0);
__ Fdup(z2.VnS(), 21.0);
__ Sel(z3.VnS(), p2, z0.VnS(), z1.VnS());
__ Sel(z4.VnS(), p2, z1.VnS(), z2.VnS());
__ Punpklo(p2.VnH(), p2.VnB());
__ Punpklo(p3.VnH(), p3.VnB());
__ Dup(z7.VnS(), 0);
__ Fcmla(z7.VnS(), p3.Merging(), z7.VnS(), z4.VnS(), z3.VnS(), 0);
__ Fcmla(z7.VnS(), p3.Merging(), z7.VnS(), z4.VnS(), z3.VnS(), 90);
__ Ext(z7.VnB(), z7.VnB(), z7.VnB(), 8);
__ Dup(z8.VnS(), 0);
__ Fcmla(z8.VnS(), p2.Merging(), z8.VnS(), z4.VnS(), z3.VnS(), 180);
__ Fcmla(z8.VnS(), p2.Merging(), z8.VnS(), z4.VnS(), z3.VnS(), 270);
__ Fneg(z8.VnS(), p2.Merging(), z8.VnS());
// Double precision computed for even lanes only.
__ Zip1(p2.VnD(), p0.VnD(), p1.VnD());
__ Index(z1.VnD(), -4, 3);
__ Zip1(z1.VnD(), z1.VnD(), z1.VnD());
__ Zip1(z1.VnD(), z1.VnD(), z1.VnD());
__ Scvtf(z1.VnD(), p0.Merging(), z1.VnD());
__ Fdup(z0.VnD(), 20.0);
__ Fdup(z2.VnD(), 21.0);
__ Sel(z3.VnD(), p2, z0.VnD(), z1.VnD());
__ Sel(z4.VnD(), p2, z1.VnD(), z2.VnD());
__ Punpklo(p2.VnH(), p2.VnB());
__ Dup(z9.VnD(), 0);
__ Fcmla(z9.VnD(), p2.Merging(), z9.VnD(), z4.VnD(), z3.VnD(), 0);
__ Fcmla(z9.VnD(), p2.Merging(), z9.VnD(), z4.VnD(), z3.VnD(), 90);
__ Dup(z10.VnD(), 0);
__ Fcmla(z10.VnD(), p2.Merging(), z10.VnD(), z4.VnD(), z3.VnD(), 180);
__ Fcmla(z10.VnD(), p2.Merging(), z10.VnD(), z4.VnD(), z3.VnD(), 270);
__ Fneg(z10.VnD(), p2.Merging(), z10.VnD());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(z5.VnH(), z6.VnH());
ASSERT_EQUAL_SVE(z7.VnS(), z8.VnS());
ASSERT_EQUAL_SVE(z9.VnD(), z10.VnD());
}
}
// Create a pattern in dst where the value of each element in src is incremented
// by the segment number. This allows varying a short input by a predictable
// pattern for each segment.
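//
// For example, with H-sized lanes (eight per 128-bit segment) and
// src = {x0, x1, ...}, the result is {x0+0, ..., x7+0, x8+1, ..., x15+1, ...}.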
static void FPSegmentPatternHelper(MacroAssembler* masm,
const ZRegister& dst,
const PRegisterM& ptrue,
const ZRegister& src) {
VIXL_ASSERT(AreSameLaneSize(dst, src));
UseScratchRegisterScope temps(masm);
ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(dst);
masm->Index(ztmp, 0, 1);
masm->Asr(ztmp, ztmp, kQRegSizeInBytesLog2 - dst.GetLaneSizeInBytesLog2());
masm->Scvtf(ztmp, ptrue, ztmp);
masm->Fadd(dst, src, ztmp);
}
TEST_SVE(sve_fpmul_index) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t in0[] = {0x3ff000003f803c00, 0xbff00000bf80bc00};
uint64_t in1[] = {0x3ff012343ff03c76, 0xbff01234bff0bc76};
__ Ptrue(p0.VnB());
// Repeat indexed vector across up to 2048-bit VL.
for (size_t i = 0; i < (kZRegMaxSize / kDRegSize); i++) {
InsrHelper(&masm, z25.VnD(), in0);
}
InsrHelper(&masm, z1.VnD(), in1);
FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z25.VnH());
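// Fmul (indexed) multiplies each element of z1 by the element of the
// corresponding 128-bit segment of z0 selected by the immediate index.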
__ Fmul(z2.VnH(), z1.VnH(), z0.VnH(), 0);
__ Fmul(z3.VnH(), z1.VnH(), z0.VnH(), 1);
__ Fmul(z4.VnH(), z1.VnH(), z0.VnH(), 4);
__ Fmul(z5.VnH(), z1.VnH(), z0.VnH(), 7);
__ Fmul(z6.VnS(), z1.VnS(), z0.VnS(), 0);
__ Fmul(z7.VnS(), z1.VnS(), z0.VnS(), 1);
__ Fmul(z8.VnS(), z1.VnS(), z0.VnS(), 2);
__ Fmul(z9.VnS(), z1.VnS(), z0.VnS(), 3);
__ Fmul(z10.VnD(), z1.VnD(), z0.VnD(), 0);
__ Fmul(z11.VnD(), z1.VnD(), z0.VnD(), 1);
// Compute the results using other instructions.
__ Dup(z12.VnH(), z25.VnH(), 0);
FPSegmentPatternHelper(&masm, z12.VnH(), p0.Merging(), z12.VnH());
__ Fmul(z12.VnH(), z1.VnH(), z12.VnH());
__ Dup(z13.VnH(), z25.VnH(), 1);
FPSegmentPatternHelper(&masm, z13.VnH(), p0.Merging(), z13.VnH());
__ Fmul(z13.VnH(), z1.VnH(), z13.VnH());
__ Dup(z14.VnH(), z25.VnH(), 4);
FPSegmentPatternHelper(&masm, z14.VnH(), p0.Merging(), z14.VnH());
__ Fmul(z14.VnH(), z1.VnH(), z14.VnH());
__ Dup(z15.VnH(), z25.VnH(), 7);
FPSegmentPatternHelper(&masm, z15.VnH(), p0.Merging(), z15.VnH());
__ Fmul(z15.VnH(), z1.VnH(), z15.VnH());
__ Dup(z16.VnS(), z25.VnS(), 0);
FPSegmentPatternHelper(&masm, z16.VnH(), p0.Merging(), z16.VnH());
__ Fmul(z16.VnS(), z1.VnS(), z16.VnS());
__ Dup(z17.VnS(), z25.VnS(), 1);
FPSegmentPatternHelper(&masm, z17.VnH(), p0.Merging(), z17.VnH());
__ Fmul(z17.VnS(), z1.VnS(), z17.VnS());
__ Dup(z18.VnS(), z25.VnS(), 2);
FPSegmentPatternHelper(&masm, z18.VnH(), p0.Merging(), z18.VnH());
__ Fmul(z18.VnS(), z1.VnS(), z18.VnS());
__ Dup(z19.VnS(), z25.VnS(), 3);
FPSegmentPatternHelper(&masm, z19.VnH(), p0.Merging(), z19.VnH());
__ Fmul(z19.VnS(), z1.VnS(), z19.VnS());
__ Dup(z20.VnD(), z25.VnD(), 0);
FPSegmentPatternHelper(&masm, z20.VnH(), p0.Merging(), z20.VnH());
__ Fmul(z20.VnD(), z1.VnD(), z20.VnD());
__ Dup(z21.VnD(), z25.VnD(), 1);
FPSegmentPatternHelper(&masm, z21.VnH(), p0.Merging(), z21.VnH());
__ Fmul(z21.VnD(), z1.VnD(), z21.VnD());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(z12.VnH(), z2.VnH());
ASSERT_EQUAL_SVE(z13.VnH(), z3.VnH());
ASSERT_EQUAL_SVE(z14.VnH(), z4.VnH());
ASSERT_EQUAL_SVE(z15.VnH(), z5.VnH());
ASSERT_EQUAL_SVE(z16.VnS(), z6.VnS());
ASSERT_EQUAL_SVE(z17.VnS(), z7.VnS());
ASSERT_EQUAL_SVE(z18.VnS(), z8.VnS());
ASSERT_EQUAL_SVE(z19.VnS(), z9.VnS());
ASSERT_EQUAL_SVE(z20.VnD(), z10.VnD());
ASSERT_EQUAL_SVE(z21.VnD(), z11.VnD());
}
}
TEST_SVE(sve_ftmad) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t in_h0[] = {0x7c027e01fc02fe01,
0x3c003c00bc00bc00,
0x3c003c00bc00bc00};
uint64_t in_h1[] = {0xfe01fc027e017e01,
0x3c00bc003c00bc00,
0x3c00bc003c00bc00};
uint64_t in_s0[] = {0x7f800002ffc00001,
0x3f8000003f800000,
0xbf800000bf800000};
uint64_t in_s1[] = {0xffc00001ffc00001,
0x3f800000bf800000,
0x3f800000bf800000};
uint64_t in_d0[] = {0x7ff8000000000001,
0x3ff0000000000000,
0xbff0000000000000};
uint64_t in_d1[] = {0xfff0000000000002,
0xbff0000000000000,
0x3ff0000000000000};
InsrHelper(&masm, z0.VnD(), in_h0);
InsrHelper(&masm, z1.VnD(), in_h1);
InsrHelper(&masm, z2.VnD(), in_s0);
InsrHelper(&masm, z3.VnD(), in_s1);
InsrHelper(&masm, z4.VnD(), in_d0);
InsrHelper(&masm, z5.VnD(), in_d1);
__ Mov(z6, z0);
__ Ftmad(z6.VnH(), z6.VnH(), z1.VnH(), 0);
__ Mov(z7, z0);
__ Ftmad(z7.VnH(), z7.VnH(), z1.VnH(), 1);
__ Mov(z8, z0);
__ Ftmad(z8.VnH(), z8.VnH(), z1.VnH(), 2);
__ Mov(z9, z2);
__ Ftmad(z9.VnS(), z9.VnS(), z3.VnS(), 0);
__ Mov(z10, z2);
__ Ftmad(z10.VnS(), z10.VnS(), z3.VnS(), 3);
__ Mov(z11, z2);
__ Ftmad(z11.VnS(), z11.VnS(), z3.VnS(), 4);
__ Mov(z12, z4);
__ Ftmad(z12.VnD(), z12.VnD(), z5.VnD(), 0);
__ Mov(z13, z4);
__ Ftmad(z13.VnD(), z13.VnD(), z5.VnD(), 5);
__ Mov(z14, z4);
__ Ftmad(z14.VnD(), z14.VnD(), z5.VnD(), 7);
END();
if (CAN_RUN()) {
RUN();
uint64_t expected_z6[] = {0x7e027e02fe02fe01,
0x4000400000000000,
0x4000400000000000};
ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
uint64_t expected_z7[] = {0x7e027e02fe02fe01,
0x3aab3800bcabbe00,
0x3aab3800bcabbe00};
ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
uint64_t expected_z8[] = {0x7e027e02fe02fe01,
0x3c083c2abbefbbac,
0x3c083c2abbefbbac};
ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
uint64_t expected_z9[] = {0x7fc00002ffc00001,
0x4000000040000000,
0x0000000000000000};
ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
uint64_t expected_z10[] = {0x7fc00002ffc00001,
0x3f7ff2ff3f7fa4fc,
0xbf800680bf802d82};
ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
uint64_t expected_z11[] = {0x7fc00002ffc00001,
0x3f8000173f8000cd,
0xbf7fffd2bf7ffe66};
ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
uint64_t expected_z12[] = {0x7ff8000000000002,
0x4000000000000000,
0x0000000000000000};
ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
uint64_t expected_z13[] = {0x7ff8000000000002,
0x3fefffff6c0d846c,
0xbff0000006b978ae};
ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
uint64_t expected_z14[] = {0x7ff8000000000002,
0x3feffffffffe708a,
0xbff0000000000000};
ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
}
}
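// Apply a set of predicated FP arithmetic macros (Fadd, Fsub, Fabd, Fmul,
// Fmulx, Fminnm and Fmaxnm) to the given inputs, leaving the results in
// z2-z10 for the caller to check.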
static void BasicFPArithHelper(MacroAssembler* masm,
int lane_size_in_bits,
const uint64_t (&inputs)[2],
const uint64_t (&inputs_fmulx)[2],
const uint64_t (&inputs_nans)[2]) {
int ls = lane_size_in_bits;
for (int i = 0; i < 16; i++) {
InsrHelper(masm, z0.VnD(), inputs);
}
ZRegister rvrs = z1.WithLaneSize(ls);
masm->Rev(rvrs, z0.WithLaneSize(ls));
int pred[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1};
Initialise(masm, p2.VnB(), pred);
PRegisterM p2m = p2.Merging();
masm->Mov(z2, z0);
masm->Fadd(z2.WithLaneSize(ls),
p2m,
z2.WithLaneSize(ls),
rvrs,
FastNaNPropagation);
masm->Mov(z3, z0);
masm->Fsub(z3.WithLaneSize(ls), p2m, z3.WithLaneSize(ls), rvrs);
masm->Mov(z4, z0);
masm->Fsub(z4.WithLaneSize(ls), p2m, rvrs, z4.WithLaneSize(ls));
masm->Mov(z5, z0);
masm->Fabd(z5.WithLaneSize(ls),
p2m,
z5.WithLaneSize(ls),
rvrs,
FastNaNPropagation);
masm->Mov(z6, z0);
masm->Fmul(z6.WithLaneSize(ls),
p2m,
z6.WithLaneSize(ls),
rvrs,
FastNaNPropagation);
for (int i = 0; i < 16; i++) {
InsrHelper(masm, z7.VnD(), inputs_fmulx);
}
masm->Rev(z8.WithLaneSize(ls), z7.WithLaneSize(ls));
masm->Fmulx(z7.WithLaneSize(ls),
p2m,
z7.WithLaneSize(ls),
z8.WithLaneSize(ls),
FastNaNPropagation);
InsrHelper(masm, z8.VnD(), inputs_nans);
masm->Mov(z9, z8);
masm->Fminnm(z9.WithLaneSize(ls),
p2m,
z9.WithLaneSize(ls),
rvrs,
FastNaNPropagation);
masm->Mov(z10, z8);
masm->Fmaxnm(z10.WithLaneSize(ls),
p2m,
z10.WithLaneSize(ls),
rvrs,
FastNaNPropagation);
}
TEST_SVE(sve_fp_arith_pred_h) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t inputs[] = {0x4800470046004500, 0x4400420040003c00};
uint64_t inputs_fmulx[] = {0x7c00fc007c00fc00, 0x0000800000008000};
uint64_t inputs_nans[] = {0x7fffffff7fffffff, 0x7bfffbff7fbbfbff};
BasicFPArithHelper(&masm, kHRegSize, inputs, inputs_fmulx, inputs_nans);
END();
if (CAN_RUN()) {
RUN();
uint64_t expected_z2[] = {0x4880488048804880, 0x4880420048804880};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
uint64_t expected_z3[] = {0x4700450042003c00, 0xbc004200c500c700};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
uint64_t expected_z4[] = {0xc700c500c200bc00, 0x3c00420045004700};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
uint64_t expected_z5[] = {0x4700450042003c00, 0x3c00420045004700};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
uint64_t expected_z6[] = {0x48004b004c804d00, 0x4d0042004b004800};
ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
uint64_t expected_z7[] = {0xc000c000c000c000, 0xc0008000c000c000};
ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
uint64_t expected_z9[] = {0x3c00400042004400, 0x4500fbff4700fbff};
ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
uint64_t expected_z10[] = {0x3c00400042004400, 0x7bfffbff47004800};
ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
}
}
TEST_SVE(sve_fp_arith_pred_s) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t inputs[] = {0x4080000040400000, 0x400000003f800000};
uint64_t inputs_fmulx[] = {0x7f800000ff800000, 0x0000000080000000};
uint64_t inputs_nans[] = {0x7fffffffffffffff, 0x41000000c1000000};
BasicFPArithHelper(&masm, kSRegSize, inputs, inputs_fmulx, inputs_nans);
END();
if (CAN_RUN()) {
RUN();
uint64_t expected_z2[] = {0x40a0000040a00000, 0x4000000040a00000};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
uint64_t expected_z3[] = {0x404000003f800000, 0x40000000c0400000};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
uint64_t expected_z4[] = {0xc0400000bf800000, 0x4000000040400000};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
uint64_t expected_z5[] = {0x404000003f800000, 0x4000000040400000};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
uint64_t expected_z6[] = {0x4080000040c00000, 0x4000000040800000};
ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
uint64_t expected_z7[] = {0xc0000000c0000000, 0x00000000c0000000};
ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
uint64_t expected_z9[] = {0x3f80000040000000, 0x41000000c1000000};
ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
uint64_t expected_z10[] = {0x3f80000040000000, 0x4100000040800000};
ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
}
}
TEST_SVE(sve_fp_arith_pred_d) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t inputs[] = {0x4000000000000000, 0x3ff0000000000000};
uint64_t inputs_fmulx[] = {0x7ff0000000000000, 0x8000000000000000};
uint64_t inputs_nans[] = {0x7fffffffffffffff, 0x4100000000000000};
BasicFPArithHelper(&masm, kDRegSize, inputs, inputs_fmulx, inputs_nans);
END();
if (CAN_RUN()) {
RUN();
uint64_t expected_z2[] = {0x4008000000000000, 0x4008000000000000};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
uint64_t expected_z3[] = {0x3ff0000000000000, 0xbff0000000000000};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
uint64_t expected_z4[] = {0xbff0000000000000, 0x3ff0000000000000};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
uint64_t expected_z5[] = {0x3ff0000000000000, 0x3ff0000000000000};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
uint64_t expected_z6[] = {0x4000000000000000, 0x4000000000000000};
ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
uint64_t expected_z7[] = {0xc000000000000000, 0xc000000000000000};
ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
uint64_t expected_z9[] = {0x3ff0000000000000, 0x4000000000000000};
ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
uint64_t expected_z10[] = {0x3ff0000000000000, 0x4100000000000000};
ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
}
}
TEST_SVE(sve_fp_arith_pred_imm) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int pred[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1};
Initialise(&masm, p0.VnB(), pred);
PRegisterM p0m = p0.Merging();
__ Ptrue(p1.VnB());
__ Fdup(z0.VnD(), 0.0);
__ Mov(z1, z0);
__ Fdiv(z1.VnH(), p1.Merging(), z1.VnH(), z1.VnH());
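// 0.0 / 0.0 generates the default NaN, used below to check the NaN handling
// of Fminnm and Fmaxnm.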
__ Mov(z2, z0);
__ Fadd(z2.VnH(), p0m, z2.VnH(), 0.5);
__ Mov(z3, z2);
__ Fsub(z3.VnH(), p0m, z3.VnH(), 1.0);
__ Mov(z4, z3);
__ Fsub(z4.VnH(), p0m, 1.0, z4.VnH());
__ Mov(z5, z4);
__ Fmul(z5.VnH(), p0m, z5.VnH(), 2.0);
__ Mov(z6, z1);
__ Fminnm(z6.VnH(), p0m, z6.VnH(), 0.0);
__ Mov(z7, z1);
__ Fmaxnm(z7.VnH(), p0m, z7.VnH(), 1.0);
__ Mov(z8, z5);
__ Fmin(z8.VnH(), p0m, z8.VnH(), 1.0);
__ Mov(z9, z5);
__ Fmax(z9.VnH(), p0m, z9.VnH(), 0.0);
__ Mov(z11, z0);
__ Fdiv(z11.VnS(), p1.Merging(), z11.VnS(), z11.VnS());
__ Mov(z12, z0);
__ Fadd(z12.VnS(), p0m, z12.VnS(), 0.5);
__ Mov(z13, z12);
__ Fsub(z13.VnS(), p0m, z13.VnS(), 1.0);
__ Mov(z14, z13);
__ Fsub(z14.VnS(), p0m, 1.0, z14.VnS());
__ Mov(z15, z14);
__ Fmul(z15.VnS(), p0m, z15.VnS(), 2.0);
__ Mov(z16, z11);
__ Fminnm(z16.VnS(), p0m, z16.VnS(), 0.0);
__ Mov(z17, z11);
__ Fmaxnm(z17.VnS(), p0m, z17.VnS(), 1.0);
__ Mov(z18, z15);
__ Fmin(z18.VnS(), p0m, z18.VnS(), 1.0);
__ Mov(z19, z15);
__ Fmax(z19.VnS(), p0m, z19.VnS(), 0.0);
__ Mov(z21, z0);
__ Fdiv(z21.VnD(), p1.Merging(), z21.VnD(), z21.VnD());
__ Mov(z22, z0);
__ Fadd(z22.VnD(), p0m, z22.VnD(), 0.5);
__ Mov(z23, z22);
__ Fsub(z23.VnD(), p0m, z23.VnD(), 1.0);
__ Mov(z24, z23);
__ Fsub(z24.VnD(), p0m, 1.0, z24.VnD());
__ Mov(z25, z24);
__ Fmul(z25.VnD(), p0m, z25.VnD(), 2.0);
__ Mov(z26, z21);
__ Fminnm(z26.VnD(), p0m, z26.VnD(), 0.0);
__ Mov(z27, z21);
__ Fmaxnm(z27.VnD(), p0m, z27.VnD(), 1.0);
__ Mov(z28, z25);
__ Fmin(z28.VnD(), p0m, z28.VnD(), 1.0);
__ Mov(z29, z25);
__ Fmax(z29.VnD(), p0m, z29.VnD(), 0.0);
__ Index(z0.VnH(), -3, 1);
__ Scvtf(z0.VnH(), p1.Merging(), z0.VnH());
__ Fmax(z0.VnH(), p1.Merging(), z0.VnH(), 0.0);
__ Index(z1.VnS(), -4, 2);
__ Scvtf(z1.VnS(), p1.Merging(), z1.VnS());
__ Fadd(z1.VnS(), p1.Merging(), z1.VnS(), 1.0);
END();
if (CAN_RUN()) {
RUN();
uint64_t expected_z2[] = {0x3800380038003800, 0x3800000038003800};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
uint64_t expected_z3[] = {0xb800b800b800b800, 0xb8000000b800b800};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
uint64_t expected_z4[] = {0x3e003e003e003e00, 0x3e0000003e003e00};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
uint64_t expected_z5[] = {0x4200420042004200, 0x4200000042004200};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
uint64_t expected_z6[] = {0x0000000000000000, 0x00007e0000000000};
ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
uint64_t expected_z7[] = {0x3c003c003c003c00, 0x3c007e003c003c00};
ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
uint64_t expected_z8[] = {0x3c003c003c003c00, 0x3c0000003c003c00};
ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
uint64_t expected_z9[] = {0x4200420042004200, 0x4200000042004200};
ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
uint64_t expected_z12[] = {0x3f0000003f000000, 0x000000003f000000};
ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
uint64_t expected_z13[] = {0xbf000000bf000000, 0x00000000bf000000};
ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
uint64_t expected_z14[] = {0x3fc000003fc00000, 0x000000003fc00000};
ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
uint64_t expected_z15[] = {0x4040000040400000, 0x0000000040400000};
ASSERT_EQUAL_SVE(expected_z15, z15.VnD());
uint64_t expected_z16[] = {0x0000000000000000, 0x7fc0000000000000};
ASSERT_EQUAL_SVE(expected_z16, z16.VnD());
uint64_t expected_z17[] = {0x3f8000003f800000, 0x7fc000003f800000};
ASSERT_EQUAL_SVE(expected_z17, z17.VnD());
uint64_t expected_z18[] = {0x3f8000003f800000, 0x000000003f800000};
ASSERT_EQUAL_SVE(expected_z18, z18.VnD());
uint64_t expected_z19[] = {0x4040000040400000, 0x0000000040400000};
ASSERT_EQUAL_SVE(expected_z19, z19.VnD());
uint64_t expected_z22[] = {0x3fe0000000000000, 0x3fe0000000000000};
ASSERT_EQUAL_SVE(expected_z22, z22.VnD());
uint64_t expected_z23[] = {0xbfe0000000000000, 0xbfe0000000000000};
ASSERT_EQUAL_SVE(expected_z23, z23.VnD());
uint64_t expected_z24[] = {0x3ff8000000000000, 0x3ff8000000000000};
ASSERT_EQUAL_SVE(expected_z24, z24.VnD());
uint64_t expected_z25[] = {0x4008000000000000, 0x4008000000000000};
ASSERT_EQUAL_SVE(expected_z25, z25.VnD());
uint64_t expected_z26[] = {0x0000000000000000, 0x0000000000000000};
ASSERT_EQUAL_SVE(expected_z26, z26.VnD());
uint64_t expected_z27[] = {0x3ff0000000000000, 0x3ff0000000000000};
ASSERT_EQUAL_SVE(expected_z27, z27.VnD());
uint64_t expected_z28[] = {0x3ff0000000000000, 0x3ff0000000000000};
ASSERT_EQUAL_SVE(expected_z28, z28.VnD());
uint64_t expected_z29[] = {0x4008000000000000, 0x4008000000000000};
ASSERT_EQUAL_SVE(expected_z29, z29.VnD());
uint64_t expected_z0[] = {0x4400420040003c00, 0x0000000000000000};
ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
uint64_t expected_z1[] = {0x404000003f800000, 0xbf800000c0400000};
ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
}
}
TEST_SVE(sve_fscale) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
uint64_t inputs_h[] = {0x4800470046004500, 0x4400420040003c00};
InsrHelper(&masm, z0.VnD(), inputs_h);
uint64_t inputs_s[] = {0x4080000040400000, 0x400000003f800000};
InsrHelper(&masm, z1.VnD(), inputs_s);
uint64_t inputs_d[] = {0x40f0000000000000, 0x4000000000000000};
InsrHelper(&masm, z2.VnD(), inputs_d);
uint64_t scales[] = {0x00080002fff8fffe, 0x00100001fff0ffff};
InsrHelper(&masm, z3.VnD(), scales);
__ Ptrue(p0.VnB());
int pred[] = {0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1};
Initialise(&masm, p1.VnB(), pred);
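// Fscale computes zd = zn * 2^zm, where zm holds signed integer exponents.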
__ Mov(z4, z0);
__ Fscale(z4.VnH(), p0.Merging(), z4.VnH(), z3.VnH());
__ Mov(z5, z0);
__ Fscale(z5.VnH(), p1.Merging(), z5.VnH(), z3.VnH());
__ Sunpklo(z3.VnS(), z3.VnH());
__ Mov(z6, z1);
__ Fscale(z6.VnS(), p0.Merging(), z6.VnS(), z3.VnS());
__ Mov(z7, z1);
__ Fscale(z7.VnS(), p1.Merging(), z7.VnS(), z3.VnS());
__ Sunpklo(z3.VnD(), z3.VnS());
__ Mov(z8, z2);
__ Fscale(z8.VnD(), p0.Merging(), z8.VnD(), z3.VnD());
__ Mov(z9, z2);
__ Fscale(z9.VnD(), p1.Merging(), z9.VnD(), z3.VnD());
// Test full double precision range scaling.
__ Dup(z10.VnD(), 2045);
__ Dup(z11.VnD(), 0x0010000000000000); // 2^-1022
__ Fscale(z11.VnD(), p0.Merging(), z11.VnD(), z10.VnD());
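// The expected result is 2^-1022 * 2^2045 = 2^1023, the largest finite
// double-precision power of two (0x7fe0000000000000).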
END();
if (CAN_RUN()) {
RUN();
uint64_t expected_z4[] = {0x68004f0026003d00, 0x7c00460002003800};
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
uint64_t expected_z5[] = {0x68004f0026004500, 0x7c00420002003800};
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
uint64_t expected_z6[] = {0x4880000040c00000, 0x380000003f000000};
ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
uint64_t expected_z7[] = {0x4880000040400000, 0x400000003f000000};
ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
uint64_t expected_z8[] = {0x3ff0000000000000, 0x3ff0000000000000};
ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
uint64_t expected_z9[] = {0x40f0000000000000, 0x3ff0000000000000};
ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
uint64_t expected_z11[] = {0x7fe0000000000000, 0x7fe0000000000000};
ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
}
}
typedef void (MacroAssembler::*FcvtFrintMFn)(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& zn);
typedef void (MacroAssembler::*FcvtFrintZFn)(const ZRegister& zd,
const PRegisterZ& pg,
const ZRegister& zn);
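// Test a merging (and, optionally, zeroing) conversion or rounding macro.
// The result is checked with an all-active predicate, then with a mixed
// predicate, verifying that inactive lanes are merged or zeroed as required.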
template <typename F, size_t N>
static void TestFcvtFrintHelper(Test* config,
FcvtFrintMFn macro_m,
FcvtFrintZFn macro_z,
int dst_type_size_in_bits,
int src_type_size_in_bits,
const F (&zn_inputs)[N],
const int (&pg_inputs)[N],
const uint64_t (&zd_expected_all_active)[N]) {
VIXL_ASSERT(macro_m != NULL);
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// If the input and result types have different sizes, the instruction
// operates on elements of the largest specified type.
int lane_size_in_bits =
std::max(dst_type_size_in_bits, src_type_size_in_bits);
ZRegister zd_all_active = z25;
ZRegister zd_merging = z26;
ZRegister zn = z27;
uint64_t zn_rawbits[N];
FPToRawbitsWithSize(zn_inputs, zn_rawbits, src_type_size_in_bits);
InsrHelper(&masm, zn.WithLaneSize(lane_size_in_bits), zn_rawbits);
PRegisterWithLaneSize pg_all_active = p0.WithLaneSize(lane_size_in_bits);
__ Ptrue(pg_all_active);
// Test floating-point conversions with all lanes activated.
(masm.*macro_m)(zd_all_active.WithLaneSize(dst_type_size_in_bits),
pg_all_active.Merging(),
zn.WithLaneSize(src_type_size_in_bits));
PRegisterWithLaneSize pg_merging = p1.WithLaneSize(lane_size_in_bits);
Initialise(&masm, pg_merging, pg_inputs);
__ Dup(zd_merging.VnD(), 0x0bad0bad0bad0bad);
// Use the same `zn` inputs to test floating-point conversions, but with some
// lanes set inactive.
(masm.*macro_m)(zd_merging.WithLaneSize(dst_type_size_in_bits),
pg_merging.Merging(),
zn.WithLaneSize(src_type_size_in_bits));
ZRegister zd_zeroing = z24;
PRegisterWithLaneSize pg_zeroing = p1.WithLaneSize(lane_size_in_bits);
Initialise(&masm, pg_zeroing, pg_inputs);
if (macro_z != NULL) {
__ Dup(zd_zeroing.VnD(), 0x0bad0bad0bad0bad);
(masm.*macro_z)(zd_zeroing.WithLaneSize(dst_type_size_in_bits),
pg_zeroing.Zeroing(),
zn.WithLaneSize(src_type_size_in_bits));
}
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(zd_expected_all_active,
zd_all_active.WithLaneSize(lane_size_in_bits));
uint64_t zd_expected_merging[N];
for (unsigned i = 0; i < N; i++) {
zd_expected_merging[i] =
pg_inputs[i] ? zd_expected_all_active[i]
: 0x0bad0bad0bad0bad & GetUintMask(lane_size_in_bits);
}
ASSERT_EQUAL_SVE(zd_expected_merging,
zd_merging.WithLaneSize(lane_size_in_bits));
if (macro_z != NULL) {
uint64_t zd_expected_zeroing[N] = {0};
for (unsigned i = 0; i < N; i++) {
if (pg_inputs[i]) {
zd_expected_zeroing[i] = zd_expected_all_active[i];
}
}
ASSERT_EQUAL_SVE(zd_expected_zeroing,
zd_zeroing.WithLaneSize(lane_size_in_bits));
}
}
}
template <typename F, size_t N>
static void TestFcvtzHelper(Test* config,
FcvtFrintMFn macro_m,
int dst_type_size_in_bits,
int src_type_size_in_bits,
const F (&zn_inputs)[N],
const int (&pg_inputs)[N],
const uint64_t (&zd_expected_all_active)[N]) {
TestFcvtFrintHelper(config,
macro_m,
// Fcvt variants have no zeroing predication form.
NULL,
dst_type_size_in_bits,
src_type_size_in_bits,
zn_inputs,
pg_inputs,
zd_expected_all_active);
}
TEST_SVE(fcvtzs_fcvtzu_float16) {
const double h_max_float16 = 0x7ff0; // Largest float16 < INT16_MAX.
const double h_min_float16 = -h_max_float16; // Smallest float16 > INT16_MIN.
const double largest_float16 = 0xffe0; // 65504
const double smallest_float16 = -largest_float16;
const double h_max_int_add_one = 0x8000;
double zn_inputs[] = {1.0,
1.1,
1.5,
-1.5,
h_max_float16,
h_min_float16,
largest_float16,
smallest_float16,
kFP64PositiveInfinity,
kFP64NegativeInfinity,
h_max_int_add_one};
int pg_inputs[] = {0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1};
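// The conversions saturate: out-of-range inputs, including infinities,
// produce the closest representable integer value.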
uint64_t expected_fcvtzs_fp162h[] =
{1, 1, 1, 0xffff, 0x7ff0, 0x8010, 0x7fff, 0x8000, 0x7fff, 0x8000, 0x7fff};
uint64_t expected_fcvtzu_fp162h[] =
{1, 1, 1, 0, 0x7ff0, 0, 0xffe0, 0, 0xffff, 0, 0x8000};
// Float16 to 16-bit integers.
TestFcvtzHelper(config,
&MacroAssembler::Fcvtzs,
kHRegSize,
kHRegSize,
zn_inputs,
pg_inputs,
expected_fcvtzs_fp162h);
TestFcvtzHelper(config,
&MacroAssembler::Fcvtzu,
kHRegSize,
kHRegSize,
zn_inputs,
pg_inputs,
expected_fcvtzu_fp162h);
uint64_t expected_fcvtzs_fp162w[] = {1,
1,
1,
0xffffffff,
0x7ff0,
0xffff8010,
0xffe0,
0xffff0020,
0x7fffffff,
0x80000000,
0x8000};
uint64_t expected_fcvtzu_fp162w[] =
{1, 1, 1, 0, 0x7ff0, 0, 0xffe0, 0, 0xffffffff, 0, 0x8000};
// Float16 to 32-bit integers.
TestFcvtzHelper(config,
&MacroAssembler::Fcvtzs,
kSRegSize,
kHRegSize,
zn_inputs,
pg_inputs,
expected_fcvtzs_fp162w);
TestFcvtzHelper(config,
&MacroAssembler::Fcvtzu,
kSRegSize,
kHRegSize,
zn_inputs,
pg_inputs,
expected_fcvtzu_fp162w);
uint64_t expected_fcvtzs_fp162x[] = {1,
1,
1,
0xffffffffffffffff,
0x7ff0,
0xffffffffffff8010,
0xffe0,
0xffffffffffff0020,
0x7fffffffffffffff,
0x8000000000000000,
0x8000};
uint64_t expected_fcvtzu_fp162x[] =
{1, 1, 1, 0, 0x7ff0, 0, 0xffe0, 0, 0xffffffffffffffff, 0, 0x8000};
// Float16 to 64-bit integers.
TestFcvtzHelper(config,
&MacroAssembler::Fcvtzs,
kDRegSize,
kHRegSize,
zn_inputs,
pg_inputs,
expected_fcvtzs_fp162x);
TestFcvtzHelper(config,
&MacroAssembler::Fcvtzu,
kDRegSize,
kHRegSize,
zn_inputs,
pg_inputs,
expected_fcvtzu_fp162x);
}
TEST_SVE(fcvtzs_fcvtzu_float) {
const double w_max_float = 0x7fffff80; // Largest float < INT32_MAX.
const double w_min_float = -w_max_float; // Smallest float > INT32_MIN.
const double x_max_float = 0x7fffff8000000000; // Largest float < INT64_MAX.
const double x_min_float = -x_max_float; // Smallest float > INT64_MIN.
const double w_max_int_add_one = 0x80000000;
const double x_max_int_add_one = 0x80000000'00000000;
double zn_inputs[] = {1.0,
1.1,
1.5,
-1.5,
w_max_float,
w_min_float,
x_max_float,
x_min_float,
kFP64PositiveInfinity,
kFP64NegativeInfinity,
w_max_int_add_one,
x_max_int_add_one};
int pg_inputs[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1};
uint64_t expected_fcvtzs_s2w[] = {1,
1,
1,
0xffffffff,
0x7fffff80,
0x80000080,
0x7fffffff,
0x80000000,
0x7fffffff,
0x80000000,
0x7fffffff,
0x7fffffff};
uint64_t expected_fcvtzu_s2w[] = {1,
1,
1,
0,
0x7fffff80,
0,
0xffffffff,
0,
0xffffffff,
0,
0x80000000,
0xffffffff};
// Float to 32-bit integers.
TestFcvtzHelper(config,
&MacroAssembler::Fcvtzs,
kSRegSize,
kSRegSize,
zn_inputs,
pg_inputs,
expected_fcvtzs_s2w);
TestFcvtzHelper(config,
&MacroAssembler::Fcvtzu,
kSRegSize,
kSRegSize,
zn_inputs,
pg_inputs,
expected_fcvtzu_s2w);
uint64_t expected_fcvtzs_s2x[] = {1,
1,
1,
0xffffffffffffffff,
0x7fffff80,
0xffffffff80000080,
0x7fffff8000000000,
0x8000008000000000,
0x7fffffffffffffff,
0x8000000000000000,
0x80000000,
0x7fffffffffffffff};
uint64_t expected_fcvtzu_s2x[] = {1,
1,
1,
0,
0x7fffff80,
0,
0x7fffff8000000000,
0,
0xffffffffffffffff,
0,
0x80000000,
0x8000000000000000};
// Float to 64-bit integers.
TestFcvtzHelper(config,
&MacroAssembler::Fcvtzs,
kDRegSize,
kSRegSize,
zn_inputs,
pg_inputs,
expected_fcvtzs_s2x);
TestFcvtzHelper(config,
&MacroAssembler::Fcvtzu,
kDRegSize,
kSRegSize,
zn_inputs,
pg_inputs,
expected_fcvtzu_s2x);
}
TEST_SVE(fcvtzs_fcvtzu_double) {
const double w_max_float = 0x7fffff80; // Largest float < INT32_MAX.
const double w_min_float = -w_max_float; // Smallest float > INT32_MIN.
const double x_max_float = 0x7fffff8000000000; // Largest float < INT64_MAX.
const double x_min_float = -x_max_float; // Smallest float > INT64_MIN.
const double w_max_double = kWMaxInt; // Largest double == INT32_MAX.
const double w_min_double = -w_max_double; // Smallest double > INT32_MIN.
const double x_max_double =
0x7ffffffffffffc00; // Largest double < INT64_MAX.
const double x_min_double = -x_max_double; // Smallest double > INT64_MIN.
const double w_max_int_sub_one = kWMaxInt - 1;
const double w_min_int_add_one = kWMinInt + 1;
const double w_max_int_add_one = 0x80000000;
const double x_max_int_add_one = 0x80000000'00000000;
double zn_inputs[] = {1.0,
1.1,
1.5,
-1.5,
w_max_float,
w_min_float,
x_max_float,
x_min_float,
w_max_double,
w_min_double,
x_max_double,
x_min_double,
kFP64PositiveInfinity,
kFP64NegativeInfinity,
w_max_int_sub_one,
w_min_int_add_one,
w_max_int_add_one,
x_max_int_add_one};
int pg_inputs[] = {1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0};
uint64_t expected_fcvtzs_d2w[] = {1,
1,
1,
0xffffffffffffffff,
0x7fffff80,
0xffffffff80000080,
0x7fffffff,
0xffffffff80000000,
0x7fffffff,
0xffffffff80000001,
0x7fffffff,
0xffffffff80000000,
0x7fffffff,
0xffffffff80000000,
0x7ffffffe,
0xffffffff80000001,
0x7fffffff,
0x7fffffff};
uint64_t expected_fcvtzu_d2w[] = {1,
1,
1,
0,
0x7fffff80,
0,
0xffffffff,
0,
0x7fffffff,
0,
0xffffffff,
0,
0xffffffff,
0,
0x7ffffffe,
0,
0x80000000,
0xffffffff};
// Double to 32-bit integers.
TestFcvtzHelper(config,
&MacroAssembler::Fcvtzs,
kSRegSize,
kDRegSize,
zn_inputs,
pg_inputs,
expected_fcvtzs_d2w);
TestFcvtzHelper(config,
&MacroAssembler::Fcvtzu,
kSRegSize,
kDRegSize,
zn_inputs,
pg_inputs,
expected_fcvtzu_d2w);
uint64_t expected_fcvtzs_d2x[] = {1,
1,
1,
0xffffffffffffffff,
0x7fffff80,
0xffffffff80000080,
0x7fffff8000000000,
0x8000008000000000,
0x7fffffff,
0xffffffff80000001,
0x7ffffffffffffc00,
0x8000000000000400,
0x7fffffffffffffff,
0x8000000000000000,
0x7ffffffe,
0xffffffff80000001,
0x80000000,
0x7fffffffffffffff};
uint64_t expected_fcvtzu_d2x[] = {1,
1,
1,
0,
0x7fffff80,
0,
0x7fffff8000000000,
0,
0x7fffffff,
0,
0x7ffffffffffffc00,
0,
0xffffffffffffffff,
0,
0x000000007ffffffe,
0,
0x80000000,
0x8000000000000000};
// Double to 64-bit integers.
TestFcvtzHelper(config,
&MacroAssembler::Fcvtzs,
kDRegSize,
kDRegSize,
zn_inputs,
pg_inputs,
expected_fcvtzs_d2x);
TestFcvtzHelper(config,
&MacroAssembler::Fcvtzu,
kDRegSize,
kDRegSize,
zn_inputs,
pg_inputs,
expected_fcvtzu_d2x);
}
template <typename F, size_t N>
static void TestFrintHelper(Test* config,
FcvtFrintMFn macro_m,
FcvtFrintZFn macro_z,
int lane_size_in_bits,
const F (&zn_inputs)[N],
const int (&pg_inputs)[N],
const F (&zd_expected)[N]) {
uint64_t zd_expected_rawbits[N];
FPToRawbitsWithSize(zd_expected, zd_expected_rawbits, lane_size_in_bits);
TestFcvtFrintHelper(config,
macro_m,
macro_z,
lane_size_in_bits,
lane_size_in_bits,
zn_inputs,
pg_inputs,
zd_expected_rawbits);
}
TEST_SVE(frint) {
const double inf_pos = kFP64PositiveInfinity;
const double inf_neg = kFP64NegativeInfinity;
double zn_inputs[] =
{1.1, 1.5, 1.9, 2.5, -1.5, -2.5, 0.0, -0.0, -0.2, inf_pos, inf_neg};
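// Expected results for each rounding mode: a = to nearest, ties away from
// zero; i and x = current FPCR mode (to nearest, ties to even, here);
// m = towards minus infinity; n = to nearest, ties to even; p = towards
// plus infinity; z = towards zero.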
double zd_expected_a[] =
{1.0, 2.0, 2.0, 3.0, -2.0, -3.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
double zd_expected_i[] =
{1.0, 2.0, 2.0, 2.0, -2.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
double zd_expected_m[] =
{1.0, 1.0, 1.0, 2.0, -2.0, -3.0, 0.0, -0.0, -1.0, inf_pos, inf_neg};
double zd_expected_n[] =
{1.0, 2.0, 2.0, 2.0, -2.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
double zd_expected_p[] =
{2.0, 2.0, 2.0, 3.0, -1.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
double zd_expected_x[] =
{1.0, 2.0, 2.0, 2.0, -2.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
double zd_expected_z[] =
{1.0, 1.0, 1.0, 2.0, -1.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
int pg_inputs[] = {0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0};
struct TestDataSet {
FcvtFrintMFn macro_m; // merging form.
FcvtFrintZFn macro_z; // zeroing form.
double (&expected)[11];
};
TestDataSet test_data[] =
{{&MacroAssembler::Frinta, &MacroAssembler::Frinta, zd_expected_a},
{&MacroAssembler::Frinti, &MacroAssembler::Frinti, zd_expected_i},
{&MacroAssembler::Frintm, &MacroAssembler::Frintm, zd_expected_m},
{&MacroAssembler::Frintn, &MacroAssembler::Frintn, zd_expected_n},
{&MacroAssembler::Frintp, &MacroAssembler::Frintp, zd_expected_p},
{&MacroAssembler::Frintx, &MacroAssembler::Frintx, zd_expected_x},
{&MacroAssembler::Frintz, &MacroAssembler::Frintz, zd_expected_z}};
unsigned lane_sizes[] = {kHRegSize, kSRegSize, kDRegSize};
for (size_t i = 0; i < sizeof(test_data) / sizeof(TestDataSet); i++) {
for (size_t j = 0; j < ArrayLength(lane_sizes); j++) {
TestFrintHelper(config,
test_data[i].macro_m,
test_data[i].macro_z,
lane_sizes[j],
zn_inputs,
pg_inputs,
test_data[i].expected);
}
}
}
struct CvtfTestDataSet {
uint64_t int_value;
uint64_t scvtf_result;
uint64_t ucvtf_result;
};
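// Test Scvtf and Ucvtf on the same integer inputs, first with an all-active
// predicate and then with a merging predicate, checking that inactive lanes
// keep their signalling NaN fill value.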
template <size_t N>
static void TestUScvtfHelper(Test* config,
int dst_type_size_in_bits,
int src_type_size_in_bits,
const int (&pg_inputs)[N],
const CvtfTestDataSet (&data_set)[N]) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
// Unpack the data from the array of structs into individual arrays, to
// simplify the test.
uint64_t zn_inputs[N];
uint64_t expected_zd_scvtf_all_active[N];
uint64_t expected_zd_ucvtf_all_active[N];
for (size_t i = 0; i < N; i++) {
zn_inputs[i] = data_set[i].int_value;
expected_zd_scvtf_all_active[i] = data_set[i].scvtf_result;
expected_zd_ucvtf_all_active[i] = data_set[i].ucvtf_result;
}
// If the input and result types have a different size, the instruction
// operates on elements of the largest specified type.
int lane_size_in_bits =
std::max(dst_type_size_in_bits, src_type_size_in_bits);
ZRegister zd_scvtf_all_active = z25;
ZRegister zd_ucvtf_all_active = z26;
ZRegister zn = z27;
InsrHelper(&masm, zn.WithLaneSize(lane_size_in_bits), zn_inputs);
PRegisterWithLaneSize pg_all_active = p0.WithLaneSize(lane_size_in_bits);
__ Ptrue(pg_all_active);
// Test integer conversions with all lanes activated.
__ Scvtf(zd_scvtf_all_active.WithLaneSize(dst_type_size_in_bits),
pg_all_active.Merging(),
zn.WithLaneSize(src_type_size_in_bits));
__ Ucvtf(zd_ucvtf_all_active.WithLaneSize(dst_type_size_in_bits),
pg_all_active.Merging(),
zn.WithLaneSize(src_type_size_in_bits));
ZRegister zd_scvtf_merged = z23;
ZRegister zd_ucvtf_merged = z24;
PRegisterWithLaneSize pg_merged = p1.WithLaneSize(lane_size_in_bits);
Initialise(&masm, pg_merged, pg_inputs);
uint64_t snan;
switch (lane_size_in_bits) {
case kHRegSize:
snan = 0x7c11;
break;
case kSRegSize:
snan = 0x7f951111;
break;
case kDRegSize:
snan = 0x7ff5555511111111;
break;
default:
// Unreachable, but keep `snan` initialised.
VIXL_UNREACHABLE();
snan = 0;
break;
}
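// Fill the destinations with a signalling NaN pattern so that lanes left
// inactive by the merging conversions are easy to identify.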
__ Dup(zd_scvtf_merged.WithLaneSize(lane_size_in_bits), snan);
__ Dup(zd_ucvtf_merged.WithLaneSize(lane_size_in_bits), snan);
// Use the same `zn` inputs to test the integer conversions, but with some
// lanes set inactive.
__ Scvtf(zd_scvtf_merged.WithLaneSize(dst_type_size_in_bits),
pg_merged.Merging(),
zn.WithLaneSize(src_type_size_in_bits));
__ Ucvtf(zd_ucvtf_merged.WithLaneSize(dst_type_size_in_bits),
pg_merged.Merging(),
zn.WithLaneSize(src_type_size_in_bits));
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(expected_zd_scvtf_all_active,
zd_scvtf_all_active.WithLaneSize(lane_size_in_bits));
ASSERT_EQUAL_SVE(expected_zd_ucvtf_all_active,
zd_ucvtf_all_active.WithLaneSize(lane_size_in_bits));
uint64_t expected_zd_scvtf_merged[N];
for (size_t i = 0; i < N; i++) {
expected_zd_scvtf_merged[i] =
pg_inputs[i] ? expected_zd_scvtf_all_active[i] : snan;
}
ASSERT_EQUAL_SVE(expected_zd_scvtf_merged,
zd_scvtf_merged.WithLaneSize(lane_size_in_bits));
uint64_t expected_zd_ucvtf_merged[N];
for (size_t i = 0; i < N; i++) {
expected_zd_ucvtf_merged[i] =
pg_inputs[i] ? expected_zd_ucvtf_all_active[i] : snan;
}
ASSERT_EQUAL_SVE(expected_zd_ucvtf_merged,
zd_ucvtf_merged.WithLaneSize(lane_size_in_bits));
}
}
TEST_SVE(scvtf_ucvtf_h_s_d_to_float16) {
// clang-format off
CvtfTestDataSet data_set_1[] = {
// Simple conversions of positive numbers which require no rounding; the
// results should not depend on the rounding mode, and ucvtf and scvtf should
// produce the same result.
{0x0000, 0x0000, 0x0000},
{0x0001, 0x3c00, 0x3c00},
{0x0010, 0x4c00, 0x4c00},
{0x0080, 0x5800, 0x5800},
{0x0400, 0x6400, 0x6400},
// Conversions which require rounding.
{0x4000, 0x7400, 0x7400},
{0x4001, 0x7400, 0x7400},
// Round up to produce a result that's too big for the input to represent.
{0x7ff0, 0x77ff, 0x77ff},
{0x7ff1, 0x77ff, 0x77ff},
{0x7ffe, 0x7800, 0x7800},
{0x7fff, 0x7800, 0x7800}};
int pg_1[] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
TestUScvtfHelper(config, kHRegSize, kDRegSize, pg_1, data_set_1);
TestUScvtfHelper(config, kHRegSize, kSRegSize, pg_1, data_set_1);
TestUScvtfHelper(config, kHRegSize, kHRegSize, pg_1, data_set_1);
CvtfTestDataSet data_set_2[] = {
// Test mantissa extremities.
{0x0401, 0x6401, 0x6401},
{0x4020, 0x7402, 0x7402},
// The largest int16_t that fits in a float16.
{0xffef, 0xcc40, 0x7bff},
// Values that would be negative if treated as an int16_t.
{0xff00, 0xdc00, 0x7bf8},
{0x8000, 0xf800, 0x7800},
{0x8100, 0xf7f0, 0x7808},
// Check for bit pattern reproduction.
{0x0123, 0x5c8c, 0x5c8c},
{0x0cde, 0x6a6f, 0x6a6f},
// Simple conversions of negative int64_t values. These require no rounding,
// and the results should not depend on the rounding mode.
{0xf800, 0xe800, 0x7bc0},
{0xfc00, 0xe400, 0x7be0},
{0xc000, 0xf400, 0x7a00},
// Check rounding of negative int16_t values.
{0x8ffe, 0xf700, 0x7880},
{0x8fff, 0xf700, 0x7880},
{0xffee, 0xcc80, 0x7bff},
{0xffef, 0xcc40, 0x7bff}};
int pg_2[] = {1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1};
// The `32-bit to float16` and `64-bit to float16` conversions of these
// inputs have already been covered by the `16-bit to float16` ucvtf tests
// above.
TestUScvtfHelper(config, kHRegSize, kHRegSize, pg_2, data_set_2);
// clang-format on
}
TEST_SVE(scvtf_ucvtf_s_to_float) {
// clang-format off
int dst_lane_size = kSRegSize;
int src_lane_size = kSRegSize;
// Simple conversions of positive numbers which require no rounding; the
// results should not depend on the rounding mode, and ucvtf and scvtf should
// produce the same result.
CvtfTestDataSet data_set_1[] = {
{0x00000000, 0x00000000, 0x00000000},
{0x00000001, 0x3f800000, 0x3f800000},
{0x00004000, 0x46800000, 0x46800000},
{0x00010000, 0x47800000, 0x47800000},
{0x40000000, 0x4e800000, 0x4e800000}};
int pg_1[] = {1, 0, 1, 0, 0};
TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_1, data_set_1);
CvtfTestDataSet data_set_2[] = {
// Test mantissa extremities.
{0x00800001, 0x4b000001, 0x4b000001},
{0x40400000, 0x4e808000, 0x4e808000},
// The largest int32_t that fits in a float.
{0x7fffff80, 0x4effffff, 0x4effffff},
// Values that would be negative if treated as an int32_t.
{0xffffffff, 0xbf800000, 0x4f800000},
{0xffffff00, 0xc3800000, 0x4f7fffff},
{0x80000000, 0xcf000000, 0x4f000000},
{0x80000001, 0xcf000000, 0x4f000000},
// Check for bit pattern reproduction.
{0x089abcde, 0x4d09abce, 0x4d09abce},
{0x12345678, 0x4d91a2b4, 0x4d91a2b4}};
int pg_2[] = {1, 0, 1, 0, 1, 1, 1, 0, 0};
TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_2, data_set_2);
// Simple conversions of negative int32_t values. These require no rounding,
// and the results should not depend on the rounding mode.
CvtfTestDataSet data_set_3[] = {
{0xffffc000, 0xc6800000, 0x4f7fffc0},
{0xffff0000, 0xc7800000, 0x4f7fff00},
{0xc0000000, 0xce800000, 0x4f400000},
// Conversions which require rounding.
{0x72800000, 0x4ee50000, 0x4ee50000},
{0x72800001, 0x4ee50000, 0x4ee50000},
{0x73000000, 0x4ee60000, 0x4ee60000},
// Check rounding of negative int32_t values.
{0x80000140, 0xcefffffe, 0x4f000001},
{0x80000141, 0xcefffffd, 0x4f000001},
{0x80000180, 0xcefffffd, 0x4f000002},
// Round up to produce a result that's too big for the input to represent.
{0x7fffffc0, 0x4f000000, 0x4f000000},
{0x7fffffff, 0x4f000000, 0x4f000000}};
int pg_3[] = {1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0};
TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_3, data_set_3);
// clang-format on
}
TEST_SVE(scvtf_ucvtf_d_to_float) {
// clang-format off
int dst_lane_size = kSRegSize;
int src_lane_size = kDRegSize;
// Simple conversions of positive numbers which require no rounding; the
// results should not depend on the rounding mode, and ucvtf and scvtf should
// produce the same result.
CvtfTestDataSet data_set_1[] = {
{0x0000000000000000, 0x00000000, 0x00000000},
{0x0000000000000001, 0x3f800000, 0x3f800000},
{0x0000000040000000, 0x4e800000, 0x4e800000},
{0x0000000100000000, 0x4f800000, 0x4f800000},
{0x4000000000000000, 0x5e800000, 0x5e800000}};
int pg_1[] = {1, 1, 0, 1, 0};
TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_1, data_set_1);
CvtfTestDataSet data_set_2[] = {
// Test mantissa extremities.
{0x0010000000000001, 0x59800000, 0x59800000},
{0x4008000000000000, 0x5e801000, 0x5e801000},
// The largest int32_t that fits in a float.
{0x000000007fffff80, 0x4effffff, 0x4effffff},
// Values that would be negative if treated as an int32_t.
{0x00000000ffffffff, 0x4f800000, 0x4f800000},
{0x00000000ffffff00, 0x4f7fffff, 0x4f7fffff},
{0x0000000080000000, 0x4f000000, 0x4f000000},
{0x0000000080000100, 0x4f000001, 0x4f000001},
// The largest int64_t that fits in a float.
{0x7fffff8000000000, 0x5effffff, 0x5effffff},
// Check for bit pattern reproduction.
{0x0123456789abcde0, 0x5b91a2b4, 0x5b91a2b4},
{0x0000000000876543, 0x4b076543, 0x4b076543}};
int pg_2[] = {1, 0, 0, 0, 1, 0, 0, 0, 0, 1};
TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_2, data_set_2);
CvtfTestDataSet data_set_3[] = {
// Simple conversions of negative int64_t values. These require no rounding,
// and the results should not depend on the rounding mode.
{0xffffffffc0000000, 0xce800000, 0x5f800000},
{0xffffffff00000000, 0xcf800000, 0x5f800000},
{0xc000000000000000, 0xde800000, 0x5f400000},
// Conversions which require rounding.
{0x0000800002800000, 0x57000002, 0x57000002},
{0x0000800002800001, 0x57000003, 0x57000003},
{0x0000800003000000, 0x57000003, 0x57000003},
// Check rounding of negative int64_t values.
{0x8000014000000000, 0xdefffffe, 0x5f000001},
{0x8000014000000001, 0xdefffffd, 0x5f000001},
{0x8000018000000000, 0xdefffffd, 0x5f000002},
// Round up to produce a result that's too big for the input to represent.
{0x00000000ffffff80, 0x4f800000, 0x4f800000},
{0x00000000ffffffff, 0x4f800000, 0x4f800000},
{0xffffff8000000000, 0xd3000000, 0x5f800000},
{0xffffffffffffffff, 0xbf800000, 0x5f800000}};
int pg_3[] = {0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1};
TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_3, data_set_3);
// clang-format on
}
TEST_SVE(scvtf_ucvtf_d_to_double) {
// clang-format off
int dst_lane_size = kDRegSize;
int src_lane_size = kDRegSize;
// Simple conversions of positive numbers which require no rounding; the
// results should not depend on the rounding mode, and ucvtf and scvtf should
// produce the same result.
CvtfTestDataSet data_set_1[] = {
{0x0000000000000000, 0x0000000000000000, 0x0000000000000000},
{0x0000000000000001, 0x3ff0000000000000, 0x3ff0000000000000},
{0x0000000040000000, 0x41d0000000000000, 0x41d0000000000000},
{0x0000000100000000, 0x41f0000000000000, 0x41f0000000000000},
{0x4000000000000000, 0x43d0000000000000, 0x43d0000000000000}};
int pg_1[] = {0, 1, 1, 0, 0};
TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_1, data_set_1);
CvtfTestDataSet data_set_2[] = {
// Test mantissa extremities.
{0x0010000000000001, 0x4330000000000001, 0x4330000000000001},
{0x4008000000000000, 0x43d0020000000000, 0x43d0020000000000},
// The largest int32_t that fits in a double.
{0x000000007fffffff, 0x41dfffffffc00000, 0x41dfffffffc00000},
// Values that would be negative if treated as an int32_t.
{0x00000000ffffffff, 0x41efffffffe00000, 0x41efffffffe00000},
{0x0000000080000000, 0x41e0000000000000, 0x41e0000000000000},
{0x0000000080000001, 0x41e0000000200000, 0x41e0000000200000},
// The largest int64_t that fits in a double.
{0x7ffffffffffffc00, 0x43dfffffffffffff, 0x43dfffffffffffff},
// Check for bit pattern reproduction.
{0x0123456789abcde0, 0x43723456789abcde, 0x43723456789abcde},
{0x0000000012345678, 0x41b2345678000000, 0x41b2345678000000}};
int pg_2[] = {1, 1, 1, 1, 1, 0, 0, 0, 0};
TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_2, data_set_2);
CvtfTestDataSet data_set_3[] = {
// Simple conversions of negative int64_t values. These require no rounding,
// and the results should not depend on the rounding mode.
{0xffffffffc0000000, 0xc1d0000000000000, 0x43effffffff80000},
{0xffffffff00000000, 0xc1f0000000000000, 0x43efffffffe00000},
{0xc000000000000000, 0xc3d0000000000000, 0x43e8000000000000},
// Conversions which require rounding.
{0x1000000000000280, 0x43b0000000000002, 0x43b0000000000002},
{0x1000000000000281, 0x43b0000000000003, 0x43b0000000000003},
{0x1000000000000300, 0x43b0000000000003, 0x43b0000000000003},
// Check rounding of negative int64_t values.
{0x8000000000000a00, 0xc3dffffffffffffe, 0x43e0000000000001},
{0x8000000000000a01, 0xc3dffffffffffffd, 0x43e0000000000001},
{0x8000000000000c00, 0xc3dffffffffffffd, 0x43e0000000000002},
// Round up to produce a result that's too big for the input to represent.
{0x7ffffffffffffe00, 0x43e0000000000000, 0x43e0000000000000},
{0x7fffffffffffffff, 0x43e0000000000000, 0x43e0000000000000},
{0xfffffffffffffc00, 0xc090000000000000, 0x43f0000000000000},
{0xffffffffffffffff, 0xbff0000000000000, 0x43f0000000000000}};
int pg_3[] = {1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0};
TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_3, data_set_3);
// clang-format on
}
TEST_SVE(scvtf_ucvtf_s_to_double) {
// clang-format off
int dst_lane_size = kDRegSize;
int src_lane_size = kSRegSize;
// Simple conversions of positive numbers which require no rounding; the
// results should not depend on the rounding mode, and ucvtf and scvtf should
// produce the same result.
CvtfTestDataSet data_set_1[] = {
{0x00000000, 0x0000000000000000, 0x0000000000000000},
{0x00000001, 0x3ff0000000000000, 0x3ff0000000000000},
{0x00004000, 0x40d0000000000000, 0x40d0000000000000},
{0x00010000, 0x40f0000000000000, 0x40f0000000000000},
{0x40000000, 0x41d0000000000000, 0x41d0000000000000}};
int pg_1[] = {1, 0, 0, 0, 1};
TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_1, data_set_1);
CvtfTestDataSet data_set_2[] = {
// Test mantissa extremities.
{0x40000400, 0x41d0000100000000, 0x41d0000100000000},
// The largest int32_t that fits in a double.
{0x7fffffff, 0x41dfffffffc00000, 0x41dfffffffc00000},
// Values that would be negative if treated as an int32_t.
{0xffffffff, 0xbff0000000000000, 0x41efffffffe00000},
{0x80000000, 0xc1e0000000000000, 0x41e0000000000000},
{0x80000001, 0xc1dfffffffc00000, 0x41e0000000200000},
// Check for bit pattern reproduction.
{0x089abcde, 0x41a13579bc000000, 0x41a13579bc000000},
{0x12345678, 0x41b2345678000000, 0x41b2345678000000},
// Simple conversions of negative int32_t values. These require no rounding,
// and the results should not depend on the rounding mode.
{0xffffc000, 0xc0d0000000000000, 0x41effff800000000},
{0xffff0000, 0xc0f0000000000000, 0x41efffe000000000},
{0xc0000000, 0xc1d0000000000000, 0x41e8000000000000}};
int pg_2[] = {1, 0, 1, 0, 0, 1, 1, 0, 1, 1};
TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_2, data_set_2);
// Note that the IEEE 754 double-precision format has a 52-bit fraction, so
// all 32-bit integers are exactly representable in a double.
// clang-format on
}
TEST_SVE(sve_fadda) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kFP);
START();
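// Fadda is a strictly-ordered floating-point add reduction, accumulating
// the active elements in element order.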
__ Ptrue(p0.VnB());
__ Pfalse(p1.VnB());
__ Zip1(p1.VnH(), p0.VnH(), p1.VnH());
__ Index(z0.VnS(), 3, 3);
__ Scvtf(z0.VnS(), p0.Merging(), z0.VnS());
__ Fmov(s2, 2.0);
__ Fadda(s2, p0, s2, z0.VnS());
__ Index(z0.VnD(), -7, -7);
__ Scvtf(z0.VnD(), p0.Merging(), z0.VnD());
__ Fmov(d3, 3.0);
__ Fadda(d3, p0, d3, z0.VnD());
__ Index(z0.VnH(), 1, 1);
__ Scvtf(z0.VnH(), p0.Merging(), z0.VnH());
__ Fmov(h4, 0);
__ Fadda(h4, p1, h4, z0.VnH());
END();
if (CAN_RUN()) {
RUN();
// The sum of 1 .. n is n(n+1)/2. n is even, so (n + 1) * (n / 2) is exact.
int n = core.GetSVELaneCount(kSRegSize);
ASSERT_EQUAL_FP32(2 + 3 * ((n + 1) * (n / 2)), s2);
n /= 2; // Half as many lanes.
ASSERT_EQUAL_FP64(3 + -7 * ((n + 1) * (n / 2)), d3);
// Sum of first n odd numbers is n^2.
n = core.GetSVELaneCount(kHRegSize) / 2; // Half are odd numbers.
ASSERT_EQUAL_FP16(Float16(n * n), h4);
}
}
TEST_SVE(sve_extract) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
__ Index(z0.VnB(), 0, 1);
__ Mov(z1, z0);
__ Mov(z2, z0);
__ Mov(z3, z0);
__ Mov(z4, z0);
__ Mov(z5, z0);
__ Mov(z6, z0);
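// An index greater than or equal to the vector length in bytes behaves as
// an index of zero, leaving the first operand unchanged; the VL-dependent
// checks below rely on this.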
__ Ext(z1, z1, z0, 0);
__ Ext(z2, z2, z0, 1);
__ Ext(z3, z3, z0, 15);
__ Ext(z4, z4, z0, 31);
__ Ext(z5, z5, z0, 47);
__ Ext(z6, z6, z0, 255);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(z1, z0);
int lane_count = core.GetSVELaneCount(kBRegSize);
if (lane_count == 16) {
uint64_t z2_expected[] = {0x000f0e0d0c0b0a09, 0x0807060504030201};
ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
} else {
uint64_t z2_expected[] = {0x100f0e0d0c0b0a09, 0x0807060504030201};
ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
}
if (lane_count == 16) {
uint64_t z3_expected[] = {0x0e0d0c0b0a090807, 0x060504030201000f};
ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
} else {
uint64_t z3_expected[] = {0x1e1d1c1b1a191817, 0x161514131211100f};
ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
}
if (lane_count < 32) {
ASSERT_EQUAL_SVE(z4, z0);
} else if (lane_count == 32) {
uint64_t z4_expected[] = {0x0e0d0c0b0a090807, 0x060504030201001f};
ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
} else {
uint64_t z4_expected[] = {0x2e2d2c2b2a292827, 0x262524232221201f};
ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
}
if (lane_count < 48) {
ASSERT_EQUAL_SVE(z5, z0);
} else if (lane_count == 48) {
uint64_t z5_expected[] = {0x0e0d0c0b0a090807, 0x060504030201002f};
ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
} else {
uint64_t z5_expected[] = {0x3e3d3c3b3a393837, 0x363534333231302f};
ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
}
if (lane_count < 256) {
ASSERT_EQUAL_SVE(z6, z0);
} else {
uint64_t z6_expected[] = {0x0e0d0c0b0a090807, 0x06050403020100ff};
ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
}
}
}
TEST_SVE(sve_fp_paired_across) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
__ Ptrue(p0.VnB());
__ Pfalse(p1.VnB());
__ Zip1(p2.VnS(), p0.VnS(), p1.VnS());
__ Zip1(p3.VnD(), p0.VnD(), p1.VnD());
__ Zip1(p4.VnH(), p0.VnH(), p1.VnH());
__ Index(z0.VnS(), 3, 3);
__ Scvtf(z0.VnS(), p0.Merging(), z0.VnS());
__ Faddv(s1, p0, z0.VnS());
__ Fminv(s2, p2, z0.VnS());
__ Fmaxv(s3, p2, z0.VnS());
__ Index(z0.VnD(), -7, -7);
__ Scvtf(z0.VnD(), p0.Merging(), z0.VnD());
__ Faddv(d4, p0, z0.VnD());
__ Fminv(d5, p3, z0.VnD());
__ Fmaxv(d6, p3, z0.VnD());
__ Index(z0.VnH(), 1, 1);
__ Scvtf(z0.VnH(), p0.Merging(), z0.VnH());
__ Faddv(h7, p4, z0.VnH());
__ Fminv(h8, p4, z0.VnH());
__ Fmaxv(h9, p4, z0.VnH());
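// Fill z10 with NaNs (0.0 / 0.0), then insert +42.0 and -42.0 so that the
// Fminnmv and Fmaxnmv reductions have to ignore the NaN lanes.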
__ Dup(z10.VnH(), 0);
__ Fdiv(z10.VnH(), p0.Merging(), z10.VnH(), z10.VnH());
__ Insr(z10.VnH(), 0x5140);
__ Insr(z10.VnH(), 0xd140);
__ Ext(z10.VnB(), z10.VnB(), z10.VnB(), 2);
__ Fmaxnmv(h11, p0, z10.VnH());
__ Fmaxnmv(h12, p4, z10.VnH());
__ Fminnmv(h13, p0, z10.VnH());
__ Fminnmv(h14, p4, z10.VnH());
__ Dup(z10.VnS(), 0);
__ Fdiv(z10.VnS(), p0.Merging(), z10.VnS(), z10.VnS());
__ Insr(z10.VnS(), 0x42280000);
__ Insr(z10.VnS(), 0xc2280000);
__ Ext(z10.VnB(), z10.VnB(), z10.VnB(), 4);
__ Fmaxnmv(s15, p0, z10.VnS());
__ Fmaxnmv(s16, p2, z10.VnS());
__ Fminnmv(s17, p0, z10.VnS());
__ Fminnmv(s18, p2, z10.VnS());
__ Dup(z10.VnD(), 0);
__ Fdiv(z10.VnD(), p0.Merging(), z10.VnD(), z10.VnD());
__ Insr(z10.VnD(), 0x4045000000000000);
__ Insr(z10.VnD(), 0xc045000000000000);
__ Ext(z10.VnB(), z10.VnB(), z10.VnB(), 8);
__ Fmaxnmv(d19, p0, z10.VnD());
__ Fmaxnmv(d20, p3, z10.VnD());
__ Fminnmv(d21, p0, z10.VnD());
__ Fminnmv(d22, p3, z10.VnD());
END();
if (CAN_RUN()) {
RUN();
// The sum of 1 .. n is n(n+1)/2. n is even, so (n + 1) * (n / 2) is exact.
int n = core.GetSVELaneCount(kSRegSize);
ASSERT_EQUAL_FP32(3 * ((n + 1) * (n / 2)), s1);
ASSERT_EQUAL_FP32(3, s2);
ASSERT_EQUAL_FP32(3 * n - 3, s3);
n /= 2; // Half as many lanes.
ASSERT_EQUAL_FP64(-7 * ((n + 1) * (n / 2)), d4);
ASSERT_EQUAL_FP64(-7 * (n - 1), d5);
ASSERT_EQUAL_FP64(-7, d6);
// Sum of first n odd numbers is n^2.
n = core.GetSVELaneCount(kHRegSize) / 2; // Half are odd numbers.
ASSERT_EQUAL_FP16(Float16(n * n), h7);
ASSERT_EQUAL_FP16(Float16(1), h8);
n = core.GetSVELaneCount(kHRegSize);
ASSERT_EQUAL_FP16(Float16(n - 1), h9);
ASSERT_EQUAL_FP16(Float16(42), h11);
ASSERT_EQUAL_FP16(Float16(42), h12);
ASSERT_EQUAL_FP16(Float16(-42), h13);
ASSERT_EQUAL_FP16(Float16(42), h14);
ASSERT_EQUAL_FP32(42, s15);
ASSERT_EQUAL_FP32(42, s16);
ASSERT_EQUAL_FP32(-42, s17);
ASSERT_EQUAL_FP32(42, s18);
ASSERT_EQUAL_FP64(42, d19);
ASSERT_EQUAL_FP64(42, d20);
ASSERT_EQUAL_FP64(-42, d21);
ASSERT_EQUAL_FP64(42, d22);
}
}
TEST_SVE(sve_frecpe_frsqrte) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
__ Ptrue(p0.VnB());
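// Build lanes of ascending powers of two, with zero inserted in the lowest
// lane; the reciprocal and reciprocal square root estimates of zero are
// both positive infinity.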
__ Index(z0.VnH(), 0, 1);
__ Fdup(z1.VnH(), Float16(1));
__ Fscale(z1.VnH(), p0.Merging(), z1.VnH(), z0.VnH());
__ Insr(z1.VnH(), 0);
__ Frsqrte(z2.VnH(), z1.VnH());
__ Frecpe(z1.VnH(), z1.VnH());
__ Index(z0.VnS(), 0, 1);
__ Fdup(z3.VnS(), Float16(1));
__ Fscale(z3.VnS(), p0.Merging(), z3.VnS(), z0.VnS());
__ Insr(z3.VnS(), 0);
__ Frsqrte(z4.VnS(), z3.VnS());
__ Frecpe(z3.VnS(), z3.VnS());
__ Index(z0.VnD(), 0, 1);
__ Fdup(z5.VnD(), Float16(1));
__ Fscale(z5.VnD(), p0.Merging(), z5.VnD(), z0.VnD());
__ Insr(z5.VnD(), 0);
__ Frsqrte(z6.VnD(), z5.VnD());
__ Frecpe(z5.VnD(), z5.VnD());
END();
if (CAN_RUN()) {
RUN();
uint64_t z1_expected[] = {0x23fc27fc2bfc2ffc, 0x33fc37fc3bfc7c00};
ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
uint64_t z2_expected[] = {0x2ffc31a433fc35a4, 0x37fc39a43bfc7c00};
ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
uint64_t z3_expected[] = {0x3e7f80003eff8000, 0x3f7f80007f800000};
ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
uint64_t z4_expected[] = {0x3eff80003f348000, 0x3f7f80007f800000};
ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
uint64_t z5_expected[] = {0x3feff00000000000, 0x7ff0000000000000};
ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
uint64_t z6_expected[] = {0x3feff00000000000, 0x7ff0000000000000};
ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
}
}
TEST_SVE(sve_frecps_frsqrts) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
__ Ptrue(p0.VnB());
__ Index(z0.VnH(), 0, -1);
__ Fdup(z1.VnH(), Float16(1));
__ Fscale(z1.VnH(), p0.Merging(), z1.VnH(), z0.VnH());
__ Scvtf(z0.VnH(), p0.Merging(), z0.VnH());
__ Insr(z1.VnH(), 0);
__ Frsqrts(z2.VnH(), z1.VnH(), z0.VnH());
__ Frecps(z1.VnH(), z1.VnH(), z0.VnH());
__ Index(z0.VnS(), 0, -1);
__ Fdup(z3.VnS(), Float16(1));
__ Fscale(z3.VnS(), p0.Merging(), z3.VnS(), z0.VnS());
__ Scvtf(z0.VnS(), p0.Merging(), z0.VnS());
__ Insr(z3.VnS(), 0);
__ Frsqrts(z4.VnS(), z3.VnS(), z0.VnS());
__ Frecps(z3.VnS(), z3.VnS(), z0.VnS());
__ Index(z0.VnD(), 0, -1);
__ Fdup(z5.VnD(), Float16(1));
__ Fscale(z5.VnD(), p0.Merging(), z5.VnD(), z0.VnD());
__ Scvtf(z0.VnD(), p0.Merging(), z0.VnD());
__ Insr(z5.VnD(), 0);
__ Frsqrts(z6.VnD(), z5.VnD(), z0.VnD());
__ Frecps(z5.VnD(), z5.VnD(), z0.VnD());
END();
if (CAN_RUN()) {
RUN();
uint64_t z1_expected[] = {0x4038406040a04100, 0x4180420042004000};
ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
uint64_t z2_expected[] = {0x3e383e603ea03f00, 0x3f80400040003e00};
ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
uint64_t z3_expected[] = {0x4030000040400000, 0x4040000040000000};
ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
uint64_t z4_expected[] = {0x3ff0000040000000, 0x400000003fc00000};
ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
uint64_t z5_expected[] = {0x4008000000000000, 0x4000000000000000};
ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
uint64_t z6_expected[] = {0x4000000000000000, 0x3ff8000000000000};
ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
}
}
TEST_SVE(sve_ftsmul) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
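// Ftsmul computes zn[i] * zn[i], negating the result when bit 0 of zm[i]
// (interpreted as an integer) is set. For example, zn[i] = 2.0 with
// zm[i] = 1 produces -(2.0 * 2.0) = -4.0.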
__ Ptrue(p0.VnB());
__ Index(z0.VnH(), 0, 1);
__ Rev(z1.VnH(), z0.VnH());
__ Scvtf(z0.VnH(), p0.Merging(), z0.VnH());
__ Dup(z2.VnH(), 0);
__ Fdiv(z2.VnH(), p0.Merging(), z2.VnH(), z2.VnH());
__ Ftsmul(z3.VnH(), z0.VnH(), z1.VnH());
__ Ftsmul(z4.VnH(), z2.VnH(), z1.VnH());
__ Index(z0.VnS(), -7, 1);
__ Rev(z1.VnS(), z0.VnS());
__ Scvtf(z0.VnS(), p0.Merging(), z0.VnS());
__ Dup(z2.VnS(), 0);
__ Fdiv(z2.VnS(), p0.Merging(), z2.VnS(), z2.VnS());
__ Ftsmul(z5.VnS(), z0.VnS(), z1.VnS());
__ Ftsmul(z6.VnS(), z2.VnS(), z1.VnS());
__ Index(z0.VnD(), 2, -1);
__ Rev(z1.VnD(), z0.VnD());
__ Scvtf(z0.VnD(), p0.Merging(), z0.VnD());
__ Dup(z2.VnD(), 0);
__ Fdiv(z2.VnD(), p0.Merging(), z2.VnD(), z2.VnD());
__ Ftsmul(z7.VnD(), z0.VnD(), z1.VnD());
__ Ftsmul(z8.VnD(), z2.VnD(), z1.VnD());
END();
if (CAN_RUN()) {
RUN();
uint64_t z3_expected[] = {0x5220d0804e40cc00, 0x4880c4003c008000};
ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
uint64_t z4_expected[] = {0x7e007e007e007e00, 0x7e007e007e007e00};
ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
uint64_t z5_expected[] = {0xc180000041c80000, 0xc210000042440000};
ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
uint64_t z6_expected[] = {0x7fc000007fc00000, 0x7fc000007fc00000};
ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
uint64_t z7_expected[] = {0x3ff0000000000000, 0xc010000000000000};
ASSERT_EQUAL_SVE(z7_expected, z7.VnD());
uint64_t z8_expected[] = {0x7ff8000000000000, 0x7ff8000000000000};
ASSERT_EQUAL_SVE(z8_expected, z8.VnD());
}
}
typedef void (MacroAssembler::*FPMulAccFn)(
const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm,
FPMacroNaNPropagationOption nan_option);
// `pg_inputs` is used internally to check that predication is handled
// correctly. It has no bearing on the `result` argument: `result` holds the
// expected result under an all-true predicate.
template <typename T, size_t N>
static void FPMulAccHelper(
Test* config,
FPMulAccFn macro,
unsigned lane_size_in_bits,
const int (&pg_inputs)[N],
const T (&za_inputs)[N],
const T (&zn_inputs)[N],
const T (&zm_inputs)[N],
const uint64_t (&result)[N],
FPMacroNaNPropagationOption nan_option = FastNaNPropagation) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
ZRegister zd = z0.WithLaneSize(lane_size_in_bits);
ZRegister za = z1.WithLaneSize(lane_size_in_bits);
ZRegister zn = z2.WithLaneSize(lane_size_in_bits);
ZRegister zm = z3.WithLaneSize(lane_size_in_bits);
uint64_t za_rawbits[N];
uint64_t zn_rawbits[N];
uint64_t zm_rawbits[N];
FPToRawbitsWithSize(za_inputs, za_rawbits, lane_size_in_bits);
FPToRawbitsWithSize(zn_inputs, zn_rawbits, lane_size_in_bits);
FPToRawbitsWithSize(zm_inputs, zm_rawbits, lane_size_in_bits);
InsrHelper(&masm, za, za_rawbits);
InsrHelper(&masm, zn, zn_rawbits);
InsrHelper(&masm, zm, zm_rawbits);
// Initialize `zd` with a signalling NaN.
uint64_t sn = GetSignallingNan(lane_size_in_bits);
__ Mov(x29, sn);
__ Dup(zd, x29);
Initialise(&masm, p0.WithLaneSize(lane_size_in_bits), pg_inputs);
// Based on which registers are aliased, each macro automatically selects
// between the following forms:
//   Fmla:  fmla,  fmad  or movprfx + fmla
//   Fmls:  fmls,  fmsb  or movprfx + fmls
//   Fnmla: fnmla, fnmad or movprfx + fnmla
//   Fnmls: fnmls, fnmsb or movprfx + fnmls
ZRegister da_result = z10.WithLaneSize(lane_size_in_bits);
ZRegister dn_result = z11.WithLaneSize(lane_size_in_bits);
ZRegister dm_result = z12.WithLaneSize(lane_size_in_bits);
ZRegister d_result = z13.WithLaneSize(lane_size_in_bits);
__ Mov(da_result, za);
(masm.*macro)(da_result, p0.Merging(), da_result, zn, zm, nan_option);
__ Mov(dn_result, zn);
(masm.*macro)(dn_result, p0.Merging(), za, dn_result, zm, nan_option);
__ Mov(dm_result, zm);
(masm.*macro)(dm_result, p0.Merging(), za, zn, dm_result, nan_option);
__ Mov(d_result, zd);
(masm.*macro)(d_result, p0.Merging(), za, zn, zm, nan_option);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(za_rawbits, za);
ASSERT_EQUAL_SVE(zn_rawbits, zn);
ASSERT_EQUAL_SVE(zm_rawbits, zm);
uint64_t da_expected[N];
uint64_t dn_expected[N];
uint64_t dm_expected[N];
uint64_t d_expected[N];
for (size_t i = 0; i < N; i++) {
da_expected[i] = ((pg_inputs[i] & 1) != 0) ? result[i] : za_rawbits[i];
dn_expected[i] = ((pg_inputs[i] & 1) != 0) ? result[i] : zn_rawbits[i];
dm_expected[i] = ((pg_inputs[i] & 1) != 0) ? result[i] : zm_rawbits[i];
d_expected[i] = ((pg_inputs[i] & 1) != 0) ? result[i] : sn;
}
ASSERT_EQUAL_SVE(da_expected, da_result);
ASSERT_EQUAL_SVE(dn_expected, dn_result);
ASSERT_EQUAL_SVE(dm_expected, dm_result);
ASSERT_EQUAL_SVE(d_expected, d_result);
}
}
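// For reference: with an all-true predicate, the `result` values passed to
// FPMulAccHelper can be derived on the host with a fused multiply-add, since
// fmla computes za + (zn * zm) with a single rounding. A minimal sketch for
// the double-precision case (illustrative only; not used by the tests):
//
// uint64_t ReferenceFmla(double za, double zn, double zm) {
// return DoubleToRawbits(std::fma(zn, zm, za));
// }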
TEST_SVE(sve_fmla_fmad) {
// fmla : zd = za + zn * zm
double za_inputs[] = {-39.0, 1.0, -3.0, 2.0};
double zn_inputs[] = {-5.0, -20.0, 9.0, 8.0};
double zm_inputs[] = {9.0, -5.0, 4.0, 5.0};
int pg_inputs[] = {1, 1, 0, 1};
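// For example, in lane 0: zd = za + zn * zm = -39.0 + (-5.0 * 9.0) = -84.0.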
uint64_t fmla_result_h[] = {Float16ToRawbits(Float16(-84.0)),
Float16ToRawbits(Float16(101.0)),
Float16ToRawbits(Float16(33.0)),
Float16ToRawbits(Float16(42.0))};
// `fmad` has been tested in the helper.
FPMulAccHelper(config,
&MacroAssembler::Fmla,
kHRegSize,
pg_inputs,
za_inputs,
zn_inputs,
zm_inputs,
fmla_result_h);
uint64_t fmla_result_s[] = {FloatToRawbits(-84.0f),
FloatToRawbits(101.0f),
FloatToRawbits(33.0f),
FloatToRawbits(42.0f)};
FPMulAccHelper(config,
&MacroAssembler::Fmla,
kSRegSize,
pg_inputs,
za_inputs,
zn_inputs,
zm_inputs,
fmla_result_s);
uint64_t fmla_result_d[] = {DoubleToRawbits(-84.0),
DoubleToRawbits(101.0),
DoubleToRawbits(33.0),
DoubleToRawbits(42.0)};
FPMulAccHelper(config,
&MacroAssembler::Fmla,
kDRegSize,
pg_inputs,
za_inputs,
zn_inputs,
zm_inputs,
fmla_result_d);
}
TEST_SVE(sve_fmls_fmsb) {
// fmls : zd = za - zn * zm
double za_inputs[] = {-39.0, 1.0, -3.0, 2.0};
double zn_inputs[] = {-5.0, -20.0, 9.0, 8.0};
double zm_inputs[] = {9.0, -5.0, 4.0, 5.0};
int pg_inputs[] = {1, 0, 1, 1};
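// For example, in lane 0: zd = za - zn * zm = -39.0 - (-5.0 * 9.0) = 6.0.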
uint64_t fmls_result_h[] = {Float16ToRawbits(Float16(6.0)),
Float16ToRawbits(Float16(-99.0)),
Float16ToRawbits(Float16(-39.0)),
Float16ToRawbits(Float16(-38.0))};
// `fmsb` has been tested in the helper.
FPMulAccHelper(config,
&MacroAssembler::Fmls,
kHRegSize,
pg_inputs,
za_inputs,
zn_inputs,
zm_inputs,
fmls_result_h);
uint64_t fmls_result_s[] = {FloatToRawbits(6.0f),
FloatToRawbits(-99.0f),
FloatToRawbits(-39.0f),
FloatToRawbits(-38.0f)};
FPMulAccHelper(config,
&MacroAssembler::Fmls,
kSRegSize,
pg_inputs,
za_inputs,
zn_inputs,
zm_inputs,
fmls_result_s);
uint64_t fmls_result_d[] = {DoubleToRawbits(6.0),
DoubleToRawbits(-99.0),
DoubleToRawbits(-39.0),
DoubleToRawbits(-38.0)};
FPMulAccHelper(config,
&MacroAssembler::Fmls,
kDRegSize,
pg_inputs,
za_inputs,
zn_inputs,
zm_inputs,
fmls_result_d);
}
TEST_SVE(sve_fnmla_fnmad) {
// fnmla : zd = -za - zn * zm
double za_inputs[] = {-39.0, 1.0, -3.0, 2.0};
double zn_inputs[] = {-5.0, -20.0, 9.0, 8.0};
double zm_inputs[] = {9.0, -5.0, 4.0, 5.0};
int pg_inputs[] = {0, 1, 1, 1};
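// For example, in lane 0: zd = -za - zn * zm = 39.0 - (-5.0 * 9.0) = 84.0.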
uint64_t fnmla_result_h[] = {Float16ToRawbits(Float16(84.0)),
Float16ToRawbits(Float16(-101.0)),
Float16ToRawbits(Float16(-33.0)),
Float16ToRawbits(Float16(-42.0))};
// `fnmad` has been tested in the helper.
FPMulAccHelper(config,
&MacroAssembler::Fnmla,
kHRegSize,
pg_inputs,
za_inputs,
zn_inputs,
zm_inputs,
fnmla_result_h);
uint64_t fnmla_result_s[] = {FloatToRawbits(84.0f),
FloatToRawbits(-101.0f),
FloatToRawbits(-33.0f),
FloatToRawbits(-42.0f)};
FPMulAccHelper(config,
&MacroAssembler::Fnmla,
kSRegSize,
pg_inputs,
za_inputs,
zn_inputs,
zm_inputs,
fnmla_result_s);
uint64_t fnmla_result_d[] = {DoubleToRawbits(84.0),
DoubleToRawbits(-101.0),
DoubleToRawbits(-33.0),
DoubleToRawbits(-42.0)};
FPMulAccHelper(config,
&MacroAssembler::Fnmla,
kDRegSize,
pg_inputs,
za_inputs,
zn_inputs,
zm_inputs,
fnmla_result_d);
}
TEST_SVE(sve_fnmls_fnmsb) {
// fnmls : zd = -za + zn * zm
double za_inputs[] = {-39.0, 1.0, -3.0, 2.0};
double zn_inputs[] = {-5.0, -20.0, 9.0, 8.0};
double zm_inputs[] = {9.0, -5.0, 4.0, 5.0};
int pg_inputs[] = {1, 1, 1, 0};
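// For example, in lane 0: zd = -za + zn * zm = 39.0 + (-5.0 * 9.0) = -6.0.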
uint64_t fnmls_result_h[] = {Float16ToRawbits(Float16(-6.0)),
Float16ToRawbits(Float16(99.0)),
Float16ToRawbits(Float16(39.0)),
Float16ToRawbits(Float16(38.0))};
// `fnmsb` has been tested in the helper.
FPMulAccHelper(config,
&MacroAssembler::Fnmls,
kHRegSize,
pg_inputs,
za_inputs,
zn_inputs,
zm_inputs,
fnmls_result_h);
uint64_t fnmls_result_s[] = {FloatToRawbits(-6.0f),
FloatToRawbits(99.0f),
FloatToRawbits(39.0f),
FloatToRawbits(38.0f)};
FPMulAccHelper(config,
&MacroAssembler::Fnmls,
kSRegSize,
pg_inputs,
za_inputs,
zn_inputs,
zm_inputs,
fnmls_result_s);
uint64_t fnmls_result_d[] = {DoubleToRawbits(-6.0),
DoubleToRawbits(99.0),
DoubleToRawbits(39.0),
DoubleToRawbits(38.0)};
FPMulAccHelper(config,
&MacroAssembler::Fnmls,
kDRegSize,
pg_inputs,
za_inputs,
zn_inputs,
zm_inputs,
fnmls_result_d);
}
typedef void (MacroAssembler::*FPMulAccIdxFn)(const ZRegister& zd,
const ZRegister& za,
const ZRegister& zn,
const ZRegister& zm,
int index);
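// The indexed forms multiply by a single zm element, selected by `index` from
// within each 128-bit segment of the vector rather than from the vector as a
// whole. The helper below uses FPSegmentPatternHelper to build reference
// values with the same per-segment behaviour.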
template <typename T, size_t N>
static void FPMulAccIdxHelper(Test* config,
FPMulAccFn macro,
FPMulAccIdxFn macro_idx,
const T (&za_inputs)[N],
const T (&zn_inputs)[N],
const T (&zm_inputs)[N]) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
__ Ptrue(p0.VnB());
// Repeat indexed vector across up to 2048-bit VL.
for (size_t i = 0; i < (kZRegMaxSize / kDRegSize); i += N) {
InsrHelper(&masm, z30.VnD(), zm_inputs);
}
FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z30.VnH());
InsrHelper(&masm, z1.VnD(), zn_inputs);
InsrHelper(&masm, z2.VnD(), za_inputs);
__ Mov(z3, z0);
(masm.*macro_idx)(z3.VnH(), z2.VnH(), z1.VnH(), z3.VnH(), 0); // zd == zm
__ Mov(z4, z1);
(masm.*macro_idx)(z4.VnH(), z2.VnH(), z4.VnH(), z0.VnH(), 1); // zd == zn
__ Mov(z5, z2);
(masm.*macro_idx)(z5.VnH(), z5.VnH(), z1.VnH(), z0.VnH(), 4); // zd == za
(masm.*macro_idx)(z6.VnH(), z2.VnH(), z1.VnH(), z0.VnH(), 7);
FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z30.VnS());
__ Mov(z7, z0);
(masm.*macro_idx)(z7.VnS(), z2.VnS(), z1.VnS(), z7.VnS(), 0); // zd == zm
__ Mov(z8, z1);
(masm.*macro_idx)(z8.VnS(), z2.VnS(), z8.VnS(), z0.VnS(), 1); // zd == zn
__ Mov(z9, z2);
(masm.*macro_idx)(z9.VnS(), z9.VnS(), z1.VnS(), z0.VnS(), 2); // zd == za
(masm.*macro_idx)(z10.VnS(), z2.VnS(), z1.VnS(), z0.VnS(), 3);
FPSegmentPatternHelper(&masm, z0.VnD(), p0.Merging(), z30.VnD());
__ Mov(z11, z0);
(masm.*macro_idx)(z11.VnD(), z2.VnD(), z1.VnD(), z11.VnD(), 0); // zd == zm
__ Mov(z12, z1);
(masm.*macro_idx)(z12.VnD(), z2.VnD(), z12.VnD(), z0.VnD(), 1); // zd == zn
__ Mov(z13, z2);
(masm.*macro_idx)(z13.VnD(), z13.VnD(), z1.VnD(), z0.VnD(), 0); // zd == za
__ Mov(z14, z0);
// zd == zn == zm
(masm.*macro_idx)(z14.VnD(), z2.VnD(), z14.VnD(), z14.VnD(), 1);
// The indexed forms of Fmla and Fmls never swap their arguments, so pass the
// strict NaN propagation mode to ensure that the vector-form macros below
// never swap arguments either.
FPMacroNaNPropagationOption option = StrictNaNPropagation;
// Compute the results using other instructions.
__ Dup(z0.VnH(), z30.VnH(), 0);
FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z0.VnH());
(masm.*macro)(z15.VnH(), p0.Merging(), z2.VnH(), z1.VnH(), z0.VnH(), option);
__ Dup(z0.VnH(), z30.VnH(), 1);
FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z0.VnH());
(masm.*macro)(z16.VnH(), p0.Merging(), z2.VnH(), z1.VnH(), z0.VnH(), option);
__ Dup(z0.VnH(), z30.VnH(), 4);
FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z0.VnH());
(masm.*macro)(z17.VnH(), p0.Merging(), z2.VnH(), z1.VnH(), z0.VnH(), option);
__ Dup(z0.VnH(), z30.VnH(), 7);
FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z0.VnH());
(masm.*macro)(z18.VnH(), p0.Merging(), z2.VnH(), z1.VnH(), z0.VnH(), option);
__ Dup(z0.VnS(), z30.VnS(), 0);
FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z0.VnS());
(masm.*macro)(z19.VnS(), p0.Merging(), z2.VnS(), z1.VnS(), z0.VnS(), option);
__ Dup(z0.VnS(), z30.VnS(), 1);
FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z0.VnS());
(masm.*macro)(z20.VnS(), p0.Merging(), z2.VnS(), z1.VnS(), z0.VnS(), option);
__ Dup(z0.VnS(), z30.VnS(), 2);
FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z0.VnS());
(masm.*macro)(z21.VnS(), p0.Merging(), z2.VnS(), z1.VnS(), z0.VnS(), option);
__ Dup(z0.VnS(), z30.VnS(), 3);
FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z0.VnS());
(masm.*macro)(z22.VnS(), p0.Merging(), z2.VnS(), z1.VnS(), z0.VnS(), option);
__ Dup(z0.VnD(), z30.VnD(), 0);
FPSegmentPatternHelper(&masm, z0.VnD(), p0.Merging(), z0.VnD());
(masm.*macro)(z23.VnD(), p0.Merging(), z2.VnD(), z1.VnD(), z0.VnD(), option);
__ Dup(z0.VnD(), z30.VnD(), 1);
FPSegmentPatternHelper(&masm, z0.VnD(), p0.Merging(), z0.VnD());
(masm.*macro)(z24.VnD(), p0.Merging(), z2.VnD(), z1.VnD(), z0.VnD(), option);
FPSegmentPatternHelper(&masm, z0.VnD(), p0.Merging(), z30.VnD());
__ Dup(z29.VnD(), z30.VnD(), 1);
FPSegmentPatternHelper(&masm, z29.VnD(), p0.Merging(), z29.VnD());
(masm.*macro)(z25.VnD(), p0.Merging(), z2.VnD(), z0.VnD(), z29.VnD(), option);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(z15.VnH(), z3.VnH());
ASSERT_EQUAL_SVE(z16.VnH(), z4.VnH());
ASSERT_EQUAL_SVE(z17.VnH(), z5.VnH());
ASSERT_EQUAL_SVE(z18.VnH(), z6.VnH());
ASSERT_EQUAL_SVE(z19.VnS(), z7.VnS());
ASSERT_EQUAL_SVE(z20.VnS(), z8.VnS());
ASSERT_EQUAL_SVE(z21.VnS(), z9.VnS());
ASSERT_EQUAL_SVE(z22.VnS(), z10.VnS());
ASSERT_EQUAL_SVE(z23.VnD(), z11.VnD());
ASSERT_EQUAL_SVE(z24.VnD(), z12.VnD());
ASSERT_EQUAL_SVE(z11.VnD(), z13.VnD());
ASSERT_EQUAL_SVE(z25.VnD(), z14.VnD());
}
}
TEST_SVE(sve_fmla_fmls_index) {
uint64_t zm_inputs_1[] = {0x3ff000003f803c00, 0xbff00000bf80bc00};
uint64_t zn_inputs_1[] = {0x3ff012343ff03c76, 0xbff01234bff0bc76};
uint64_t za_inputs_1[] = {0x3c004000bc00c000, 0x64006800e400e800};
// Using the vector form of Fmla and Fmls to verify the indexed form.
FPMulAccIdxHelper(config,
&MacroAssembler::Fmla, // vector form
&MacroAssembler::Fmla, // indexed form
za_inputs_1,
zn_inputs_1,
zm_inputs_1);
FPMulAccIdxHelper(config,
&MacroAssembler::Fmls, // vector form
&MacroAssembler::Fmls, // indexed form
za_inputs_1,
zn_inputs_1,
zm_inputs_1);
uint64_t zm_inputs_2[] = {0x7ff5555511111111, // NaN
0xfff0000000000000}; // Infinity
uint64_t zn_inputs_2[] = {0x7f9511117fc00000, // NaN
0x7f800000ff800000}; // Infinity
uint64_t za_inputs_2[] = {0x7c11000000007e00, // NaN
0x000000007c00fc00}; // Infinity
FPMulAccIdxHelper(config,
&MacroAssembler::Fmla, // vector form
&MacroAssembler::Fmla, // indexed form
za_inputs_2,
zn_inputs_2,
zm_inputs_2);
FPMulAccIdxHelper(config,
&MacroAssembler::Fmls, // vector form
&MacroAssembler::Fmls, // indexed form
za_inputs_2,
zn_inputs_2,
zm_inputs_2);
}
// Execute a number of instructions which all use ProcessNaNs, and check that
// they all propagate NaNs correctly.
template <typename Ti, typename Td, size_t N>
static void ProcessNaNsHelper(Test* config,
int lane_size_in_bits,
const Ti (&zn_inputs)[N],
const Ti (&zm_inputs)[N],
const Td (&zd_expected)[N],
FPMacroNaNPropagationOption nan_option) {
ArithFn arith_unpredicated_macro[] = {&MacroAssembler::Fadd,
&MacroAssembler::Fsub,
&MacroAssembler::Fmul};
for (size_t i = 0; i < ArrayLength(arith_unpredicated_macro); i++) {
FPBinArithHelper(config,
arith_unpredicated_macro[i],
lane_size_in_bits,
zn_inputs,
zm_inputs,
zd_expected);
}
FPArithPredicatedFn arith_predicated_macro[] = {&MacroAssembler::Fmax,
&MacroAssembler::Fmin};
int pg_inputs[N];
// This helper uses an all-true predicate; its focus is the handling of
// special values (NaNs), not predication.
for (size_t i = 0; i < N; i++) {
pg_inputs[i] = 1;
}
// fdivr propagates the quotient (Zm) preferentially, so we don't actually
// need any special handling for StrictNaNPropagation.
FPBinArithHelper(config,
NULL,
&MacroAssembler::Fdiv,
lane_size_in_bits,
// With an all-true predicate, the value in zd is
// irrelevant to the operations.
zn_inputs,
pg_inputs,
zn_inputs,
zm_inputs,
zd_expected);
for (size_t i = 0; i < ArrayLength(arith_predicated_macro); i++) {
FPBinArithHelper(config,
arith_predicated_macro[i],
NULL,
lane_size_in_bits,
// With an all-true predicate, the value in zd is
// irrelevant to the operations.
zn_inputs,
pg_inputs,
zn_inputs,
zm_inputs,
zd_expected,
nan_option);
}
}
template <typename Ti, typename Td, size_t N>
static void ProcessNaNsHelper3(Test* config,
int lane_size_in_bits,
const Ti (&za_inputs)[N],
const Ti (&zn_inputs)[N],
const Ti (&zm_inputs)[N],
const Td (&zd_expected_fmla)[N],
const Td (&zd_expected_fmls)[N],
const Td (&zd_expected_fnmla)[N],
const Td (&zd_expected_fnmls)[N],
FPMacroNaNPropagationOption nan_option) {
int pg_inputs[N];
// This helper uses an all-true predicate; its focus is the handling of
// special values (NaNs), not predication.
for (size_t i = 0; i < N; i++) {
pg_inputs[i] = 1;
}
FPMulAccHelper(config,
&MacroAssembler::Fmla,
lane_size_in_bits,
pg_inputs,
za_inputs,
zn_inputs,
zm_inputs,
zd_expected_fmla,
nan_option);
FPMulAccHelper(config,
&MacroAssembler::Fmls,
lane_size_in_bits,
pg_inputs,
za_inputs,
zn_inputs,
zm_inputs,
zd_expected_fmls,
nan_option);
FPMulAccHelper(config,
&MacroAssembler::Fnmla,
lane_size_in_bits,
pg_inputs,
za_inputs,
zn_inputs,
zm_inputs,
zd_expected_fnmla,
nan_option);
FPMulAccHelper(config,
&MacroAssembler::Fnmls,
lane_size_in_bits,
pg_inputs,
za_inputs,
zn_inputs,
zm_inputs,
zd_expected_fnmls,
nan_option);
}
TEST_SVE(sve_process_nans_double) {
// Use non-standard NaNs to check that the payload bits are preserved.
double sa = RawbitsToDouble(0x7ff5555511111111);
double sn = RawbitsToDouble(0x7ff5555522222222);
double sm = RawbitsToDouble(0x7ff5555533333333);
double qa = RawbitsToDouble(0x7ffaaaaa11111111);
double qn = RawbitsToDouble(0x7ffaaaaa22222222);
double qm = RawbitsToDouble(0x7ffaaaaa33333333);
VIXL_ASSERT(IsSignallingNaN(sa));
VIXL_ASSERT(IsSignallingNaN(sn));
VIXL_ASSERT(IsSignallingNaN(sm));
VIXL_ASSERT(IsQuietNaN(qa));
VIXL_ASSERT(IsQuietNaN(qn));
VIXL_ASSERT(IsQuietNaN(qm));
// The input NaNs after passing through ProcessNaN.
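// Quieting a signalling NaN sets the most significant fraction bit (bit 51
// for doubles) and preserves the payload, so 0x7ff5555511111111 becomes
// 0x7ffd555511111111.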
uint64_t sa_proc = 0x7ffd555511111111;
uint64_t sn_proc = 0x7ffd555522222222;
uint64_t sm_proc = 0x7ffd555533333333;
uint64_t qa_proc = DoubleToRawbits(qa);
uint64_t qn_proc = DoubleToRawbits(qn);
uint64_t qm_proc = DoubleToRawbits(qm);
uint64_t sa_proc_n = sa_proc ^ kDSignMask;
uint64_t sn_proc_n = sn_proc ^ kDSignMask;
uint64_t qa_proc_n = qa_proc ^ kDSignMask;
uint64_t qn_proc_n = qn_proc ^ kDSignMask;
// Quiet NaNs are propagated.
double zn_inputs_1[] = {qn, 0.0, 0.0, qm, qn, qm};
double zm_inputs_1[] = {0.0, qn, qm, 0.0, qm, qn};
uint64_t zd_expected_1[] =
{qn_proc, qn_proc, qm_proc, qm_proc, qn_proc, qm_proc};
ProcessNaNsHelper(config,
kDRegSize,
zn_inputs_1,
zm_inputs_1,
zd_expected_1,
StrictNaNPropagation);
// Signalling NaNs are propagated.
double zn_inputs_2[] = {sn, 0.0, 0.0, sm, sn, sm};
double zm_inputs_2[] = {0.0, sn, sm, 0.0, sm, sn};
uint64_t zd_expected_2[] =
{sn_proc, sn_proc, sm_proc, sm_proc, sn_proc, sm_proc};
ProcessNaNsHelper(config,
kDRegSize,
zn_inputs_2,
zm_inputs_2,
zd_expected_2,
StrictNaNPropagation);
// Signalling NaNs take precedence over quiet NaNs.
double zn_inputs_3[] = {sn, qn, sn, sn, qn};
double zm_inputs_3[] = {qm, sm, sm, qn, sn};
uint64_t zd_expected_3[] = {sn_proc, sm_proc, sn_proc, sn_proc, sn_proc};
ProcessNaNsHelper(config,
kDRegSize,
zn_inputs_3,
zm_inputs_3,
zd_expected_3,
StrictNaNPropagation);
double za_inputs_4[] = {qa, qa, 0.0, 0.0, qa, qa};
double zn_inputs_4[] = {qn, 0.0, 0.0, qn, qn, qn};
double zm_inputs_4[] = {0.0, qm, qm, qm, qm, 0.0};
// If `a` is propagated, its sign is inverted by fnmla and fnmls.
// If `n` is propagated, its sign is inverted by fmls and fnmla.
// If `m` is propagated, its sign is never inverted.
uint64_t zd_expected_fmla_4[] =
{qa_proc, qa_proc, qm_proc, qn_proc, qa_proc, qa_proc};
uint64_t zd_expected_fmls_4[] =
{qa_proc, qa_proc, qm_proc, qn_proc_n, qa_proc, qa_proc};
uint64_t zd_expected_fnmla_4[] =
{qa_proc_n, qa_proc_n, qm_proc, qn_proc_n, qa_proc_n, qa_proc_n};
uint64_t zd_expected_fnmls_4[] =
{qa_proc_n, qa_proc_n, qm_proc, qn_proc, qa_proc_n, qa_proc_n};
ProcessNaNsHelper3(config,
kDRegSize,
za_inputs_4,
zn_inputs_4,
zm_inputs_4,
zd_expected_fmla_4,
zd_expected_fmls_4,
zd_expected_fnmla_4,
zd_expected_fnmls_4,
StrictNaNPropagation);
// Signalling NaNs take precedence over quiet NaNs.
double za_inputs_5[] = {qa, qa, sa, sa, sa};
double zn_inputs_5[] = {qn, sn, sn, sn, qn};
double zm_inputs_5[] = {sm, qm, sm, qa, sm};
uint64_t zd_expected_fmla_5[] = {sm_proc, sn_proc, sa_proc, sa_proc, sa_proc};
uint64_t zd_expected_fmls_5[] = {sm_proc,
sn_proc_n,
sa_proc,
sa_proc,
sa_proc};
uint64_t zd_expected_fnmla_5[] = {sm_proc,
sn_proc_n,
sa_proc_n,
sa_proc_n,
sa_proc_n};
uint64_t zd_expected_fnmls_5[] = {sm_proc,
sn_proc,
sa_proc_n,
sa_proc_n,
sa_proc_n};
ProcessNaNsHelper3(config,
kDRegSize,
za_inputs_5,
zn_inputs_5,
zm_inputs_5,
zd_expected_fmla_5,
zd_expected_fmls_5,
zd_expected_fnmla_5,
zd_expected_fnmls_5,
StrictNaNPropagation);
const double inf = kFP64PositiveInfinity;
const double inf_n = kFP64NegativeInfinity;
uint64_t inf_proc = DoubleToRawbits(inf);
uint64_t inf_proc_n = DoubleToRawbits(inf_n);
uint64_t d_inf_proc = DoubleToRawbits(kFP64DefaultNaN);
double za_inputs_6[] = {qa, qa, 0.0, -0.0, qa, sa};
double zn_inputs_6[] = {inf, -0.0, -0.0, inf, inf_n, inf};
double zm_inputs_6[] = {0.0, inf_n, inf, inf, inf, 0.0};
// quiet_nan + (0.0 * inf) produces the default NaN, not quiet_nan. Ditto for
// (inf * 0.0). On the other hand, quiet_nan + (inf * inf) propagates the
// quiet_nan.
uint64_t zd_expected_fmla_6[] =
{d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc, sa_proc};
uint64_t zd_expected_fmls_6[] =
{d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc, sa_proc};
uint64_t zd_expected_fnmla_6[] =
{d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc_n, sa_proc_n};
uint64_t zd_expected_fnmls_6[] =
{d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc_n, sa_proc_n};
ProcessNaNsHelper3(config,
kDRegSize,
za_inputs_6,
zn_inputs_6,
zm_inputs_6,
zd_expected_fmla_6,
zd_expected_fmls_6,
zd_expected_fnmla_6,
zd_expected_fnmls_6,
StrictNaNPropagation);
}
TEST_SVE(sve_process_nans_float) {
// Use non-standard NaNs to check that the payload bits are preserved.
float sa = RawbitsToFloat(0x7f951111);
float sn = RawbitsToFloat(0x7f952222);
float sm = RawbitsToFloat(0x7f953333);
float qa = RawbitsToFloat(0x7fea1111);
float qn = RawbitsToFloat(0x7fea2222);
float qm = RawbitsToFloat(0x7fea3333);
VIXL_ASSERT(IsSignallingNaN(sa));
VIXL_ASSERT(IsSignallingNaN(sn));
VIXL_ASSERT(IsSignallingNaN(sm));
VIXL_ASSERT(IsQuietNaN(qa));
VIXL_ASSERT(IsQuietNaN(qn));
VIXL_ASSERT(IsQuietNaN(qm));
// The input NaNs after passing through ProcessNaN.
uint32_t sa_proc = 0x7fd51111;
uint32_t sn_proc = 0x7fd52222;
uint32_t sm_proc = 0x7fd53333;
uint32_t qa_proc = FloatToRawbits(qa);
uint32_t qn_proc = FloatToRawbits(qn);
uint32_t qm_proc = FloatToRawbits(qm);
uint32_t sa_proc_n = sa_proc ^ kSSignMask;
uint32_t sn_proc_n = sn_proc ^ kSSignMask;
uint32_t qa_proc_n = qa_proc ^ kSSignMask;
uint32_t qn_proc_n = qn_proc ^ kSSignMask;
// Quiet NaNs are propagated.
float zn_inputs_1[] = {qn, 0.0f, 0.0f, qm, qn, qm};
float zm_inputs_1[] = {0.0f, qn, qm, 0.0f, qm, qn};
uint64_t zd_expected_1[] =
{qn_proc, qn_proc, qm_proc, qm_proc, qn_proc, qm_proc};
ProcessNaNsHelper(config,
kSRegSize,
zn_inputs_1,
zm_inputs_1,
zd_expected_1,
StrictNaNPropagation);
// Signalling NaNs are propagated.
float zn_inputs_2[] = {sn, 0.0f, 0.0f, sm, sn, sm};
float zm_inputs_2[] = {0.0f, sn, sm, 0.0f, sm, sn};
uint64_t zd_expected_2[] =
{sn_proc, sn_proc, sm_proc, sm_proc, sn_proc, sm_proc};
ProcessNaNsHelper(config,
kSRegSize,
zn_inputs_2,
zm_inputs_2,
zd_expected_2,
StrictNaNPropagation);
// Signalling NaNs take precedence over quiet NaNs.
float zn_inputs_3[] = {sn, qn, sn, sn, qn};
float zm_inputs_3[] = {qm, sm, sm, qn, sn};
uint64_t zd_expected_3[] = {sn_proc, sm_proc, sn_proc, sn_proc, sn_proc};
ProcessNaNsHelper(config,
kSRegSize,
zn_inputs_3,
zm_inputs_3,
zd_expected_3,
StrictNaNPropagation);
float za_inputs_4[] = {qa, qa, 0.0f, 0.0f, qa, qa};
float zn_inputs_4[] = {qn, 0.0f, 0.0f, qn, qn, qn};
float zm_inputs_4[] = {0.0f, qm, qm, qm, qm, 0.0f};
// If `a` is propagated, its sign is inverted by fnmla and fnmls.
// If `n` is propagated, its sign is inverted by fmls and fnmla.
// If `m` is propagated, its sign is never inverted.
uint64_t zd_expected_fmla_4[] =
{qa_proc, qa_proc, qm_proc, qn_proc, qa_proc, qa_proc};
uint64_t zd_expected_fmls_4[] =
{qa_proc, qa_proc, qm_proc, qn_proc_n, qa_proc, qa_proc};
uint64_t zd_expected_fnmla_4[] =
{qa_proc_n, qa_proc_n, qm_proc, qn_proc_n, qa_proc_n, qa_proc_n};
uint64_t zd_expected_fnmls_4[] =
{qa_proc_n, qa_proc_n, qm_proc, qn_proc, qa_proc_n, qa_proc_n};
ProcessNaNsHelper3(config,
kSRegSize,
za_inputs_4,
zn_inputs_4,
zm_inputs_4,
zd_expected_fmla_4,
zd_expected_fmls_4,
zd_expected_fnmla_4,
zd_expected_fnmls_4,
StrictNaNPropagation);
// Signalling NaNs take precedence over quiet NaNs.
float za_inputs_5[] = {qa, qa, sa, sa, sa};
float zn_inputs_5[] = {qn, sn, sn, sn, qn};
float zm_inputs_5[] = {sm, qm, sm, qa, sm};
uint64_t zd_expected_fmla_5[] = {sm_proc, sn_proc, sa_proc, sa_proc, sa_proc};
uint64_t zd_expected_fmls_5[] = {sm_proc,
sn_proc_n,
sa_proc,
sa_proc,
sa_proc};
uint64_t zd_expected_fnmla_5[] = {sm_proc,
sn_proc_n,
sa_proc_n,
sa_proc_n,
sa_proc_n};
uint64_t zd_expected_fnmls_5[] = {sm_proc,
sn_proc,
sa_proc_n,
sa_proc_n,
sa_proc_n};
ProcessNaNsHelper3(config,
kSRegSize,
za_inputs_5,
zn_inputs_5,
zm_inputs_5,
zd_expected_fmla_5,
zd_expected_fmls_5,
zd_expected_fnmla_5,
zd_expected_fnmls_5,
StrictNaNPropagation);
const float inf = kFP32PositiveInfinity;
const float inf_n = kFP32NegativeInfinity;
uint32_t inf_proc = FloatToRawbits(inf);
uint32_t inf_proc_n = FloatToRawbits(inf_n);
uint32_t d_inf_proc = FloatToRawbits(kFP32DefaultNaN);
float za_inputs_6[] = {qa, qa, 0.0f, 0.0f, qa, sa};
float zn_inputs_6[] = {inf, 0.0f, 0.0f, inf, inf_n, inf};
float zm_inputs_6[] = {0.0f, inf_n, inf, inf, inf, 0.0f};
// quiet_nan + (0.0 * inf) produces the default NaN, not quiet_nan. Ditto for
// (inf * 0.0). On the other hand, quiet_nan + (inf * inf) propagates the
// quiet_nan.
uint64_t zd_expected_fmla_6[] =
{d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc, sa_proc};
uint64_t zd_expected_fmls_6[] =
{d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc, sa_proc};
uint64_t zd_expected_fnmla_6[] =
{d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc_n, sa_proc_n};
uint64_t zd_expected_fnmls_6[] =
{d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc_n, sa_proc_n};
ProcessNaNsHelper3(config,
kSRegSize,
za_inputs_6,
zn_inputs_6,
zm_inputs_6,
zd_expected_fmla_6,
zd_expected_fmls_6,
zd_expected_fnmla_6,
zd_expected_fnmls_6,
StrictNaNPropagation);
}
TEST_SVE(sve_process_nans_half) {
// Use non-standard NaNs to check that the payload bits are preserved.
Float16 sa(RawbitsToFloat16(0x7c11));
Float16 sn(RawbitsToFloat16(0x7c22));
Float16 sm(RawbitsToFloat16(0x7c33));
Float16 qa(RawbitsToFloat16(0x7e44));
Float16 qn(RawbitsToFloat16(0x7e55));
Float16 qm(RawbitsToFloat16(0x7e66));
VIXL_ASSERT(IsSignallingNaN(sa));
VIXL_ASSERT(IsSignallingNaN(sn));
VIXL_ASSERT(IsSignallingNaN(sm));
VIXL_ASSERT(IsQuietNaN(qa));
VIXL_ASSERT(IsQuietNaN(qn));
VIXL_ASSERT(IsQuietNaN(qm));
// The input NaNs after passing through ProcessNaN.
uint16_t sa_proc = 0x7e11;
uint16_t sn_proc = 0x7e22;
uint16_t sm_proc = 0x7e33;
uint16_t qa_proc = Float16ToRawbits(qa);
uint16_t qn_proc = Float16ToRawbits(qn);
uint16_t qm_proc = Float16ToRawbits(qm);
uint16_t sa_proc_n = sa_proc ^ kHSignMask;
uint16_t sn_proc_n = sn_proc ^ kHSignMask;
uint16_t qa_proc_n = qa_proc ^ kHSignMask;
uint16_t qn_proc_n = qn_proc ^ kHSignMask;
Float16 zero(0.0);
// Quiet NaNs are propagated.
Float16 zn_inputs_1[] = {qn, zero, zero, qm, qn, qm};
Float16 zm_inputs_1[] = {zero, qn, qm, zero, qm, qn};
uint64_t zd_expected_1[] =
{qn_proc, qn_proc, qm_proc, qm_proc, qn_proc, qm_proc};
ProcessNaNsHelper(config,
kHRegSize,
zn_inputs_1,
zm_inputs_1,
zd_expected_1,
StrictNaNPropagation);
// Signalling NaNs are propagated.
Float16 zn_inputs_2[] = {sn, zero, zero, sm, sn, sm};
Float16 zm_inputs_2[] = {zero, sn, sm, zero, sm, sn};
uint64_t zd_expected_2[] =
{sn_proc, sn_proc, sm_proc, sm_proc, sn_proc, sm_proc};
ProcessNaNsHelper(config,
kHRegSize,
zn_inputs_2,
zm_inputs_2,
zd_expected_2,
StrictNaNPropagation);
// Signalling NaNs take precedence over quiet NaNs.
Float16 zn_inputs_3[] = {sn, qn, sn, sn, qn};
Float16 zm_inputs_3[] = {qm, sm, sm, qn, sn};
uint64_t zd_expected_3[] = {sn_proc, sm_proc, sn_proc, sn_proc, sn_proc};
ProcessNaNsHelper(config,
kHRegSize,
zn_inputs_3,
zm_inputs_3,
zd_expected_3,
StrictNaNPropagation);
Float16 za_inputs_4[] = {qa, qa, zero, zero, qa, qa};
Float16 zn_inputs_4[] = {qn, zero, zero, qn, qn, qn};
Float16 zm_inputs_4[] = {zero, qm, qm, qm, qm, zero};
// If `a` is propagated, its sign is inverted by fnmla and fnmls.
// If `n` is propagated, its sign is inverted by fmls and fnmla.
// If `m` is propagated, its sign is never inverted.
uint64_t zd_expected_fmla_4[] =
{qa_proc, qa_proc, qm_proc, qn_proc, qa_proc, qa_proc};
uint64_t zd_expected_fmls_4[] =
{qa_proc, qa_proc, qm_proc, qn_proc_n, qa_proc, qa_proc};
uint64_t zd_expected_fnmla_4[] =
{qa_proc_n, qa_proc_n, qm_proc, qn_proc_n, qa_proc_n, qa_proc_n};
uint64_t zd_expected_fnmls_4[] =
{qa_proc_n, qa_proc_n, qm_proc, qn_proc, qa_proc_n, qa_proc_n};
ProcessNaNsHelper3(config,
kHRegSize,
za_inputs_4,
zn_inputs_4,
zm_inputs_4,
zd_expected_fmla_4,
zd_expected_fmls_4,
zd_expected_fnmla_4,
zd_expected_fnmls_4,
StrictNaNPropagation);
// Signalling NaNs take precedence over quiet NaNs.
Float16 za_inputs_5[] = {qa, qa, sa, sa, sa};
Float16 zn_inputs_5[] = {qn, sn, sn, sn, qn};
Float16 zm_inputs_5[] = {sm, qm, sm, qa, sm};
uint64_t zd_expected_fmla_5[] = {sm_proc, sn_proc, sa_proc, sa_proc, sa_proc};
uint64_t zd_expected_fmls_5[] = {sm_proc,
sn_proc_n,
sa_proc,
sa_proc,
sa_proc};
uint64_t zd_expected_fnmla_5[] = {sm_proc,
sn_proc_n,
sa_proc_n,
sa_proc_n,
sa_proc_n};
uint64_t zd_expected_fnmls_5[] = {sm_proc,
sn_proc,
sa_proc_n,
sa_proc_n,
sa_proc_n};
ProcessNaNsHelper3(config,
kHRegSize,
za_inputs_5,
zn_inputs_5,
zm_inputs_5,
zd_expected_fmla_5,
zd_expected_fmls_5,
zd_expected_fnmla_5,
zd_expected_fnmls_5,
StrictNaNPropagation);
const Float16 inf = kFP16PositiveInfinity;
const Float16 inf_n = kFP16NegativeInfinity;
uint64_t inf_proc = Float16ToRawbits(inf);
uint64_t inf_proc_n = Float16ToRawbits(inf_n);
uint64_t d_inf_proc = Float16ToRawbits(kFP16DefaultNaN);
Float16 za_inputs_6[] = {qa, qa, zero, zero, qa, sa};
Float16 zn_inputs_6[] = {inf, zero, zero, inf, inf_n, inf};
Float16 zm_inputs_6[] = {zero, inf_n, inf, inf, inf, zero};
// quiet_nan + (0.0 * inf) produces the default NaN, not quiet_nan. Ditto for
// (inf * 0.0). On the other hand, quiet_nan + (inf * inf) propagates the
// quiet_nan.
uint64_t zd_expected_fmla_6[] =
{d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc, sa_proc};
uint64_t zd_expected_fmls_6[] =
{d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc, sa_proc};
uint64_t zd_expected_fnmla_6[] =
{d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc_n, sa_proc_n};
uint64_t zd_expected_fnmls_6[] =
{d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc_n, sa_proc_n};
ProcessNaNsHelper3(config,
kHRegSize,
za_inputs_6,
zn_inputs_6,
zm_inputs_6,
zd_expected_fmla_6,
zd_expected_fmls_6,
zd_expected_fnmla_6,
zd_expected_fnmls_6,
StrictNaNPropagation);
}
typedef void (MacroAssembler::*FCmpFn)(const PRegisterWithLaneSize& pd,
const PRegisterZ& pg,
const ZRegister& zn,
const ZRegister& zm);
typedef void (MacroAssembler::*FCmpZeroFn)(const PRegisterWithLaneSize& pd,
const PRegisterZ& pg,
const ZRegister& zn,
double zero);
typedef void (MacroAssembler::*CmpFn)(const PRegisterWithLaneSize& pd,
const PRegisterZ& pg,
const ZRegister& zn,
const ZRegister& zm);
static FCmpFn GetFpAbsCompareFn(Condition cond) {
switch (cond) {
case ge:
return &MacroAssembler::Facge;
case gt:
return &MacroAssembler::Facgt;
case le:
return &MacroAssembler::Facle;
case lt:
return &MacroAssembler::Faclt;
default:
VIXL_UNIMPLEMENTED();
return NULL;
}
}
static FCmpFn GetFpCompareFn(Condition cond) {
switch (cond) {
case ge:
return &MacroAssembler::Fcmge;
case gt:
return &MacroAssembler::Fcmgt;
case le:
return &MacroAssembler::Fcmle;
case lt:
return &MacroAssembler::Fcmlt;
case eq:
return &MacroAssembler::Fcmeq;
case ne:
return &MacroAssembler::Fcmne;
case uo:
return &MacroAssembler::Fcmuo;
default:
VIXL_UNIMPLEMENTED();
return NULL;
}
}
static FCmpZeroFn GetFpCompareZeroFn(Condition cond) {
switch (cond) {
case ge:
return &MacroAssembler::Fcmge;
case gt:
return &MacroAssembler::Fcmgt;
case le:
return &MacroAssembler::Fcmle;
case lt:
return &MacroAssembler::Fcmlt;
case eq:
return &MacroAssembler::Fcmeq;
case ne:
return &MacroAssembler::Fcmne;
default:
VIXL_UNIMPLEMENTED();
return NULL;
}
}
static CmpFn GetIntCompareFn(Condition cond) {
switch (cond) {
case ge:
return &MacroAssembler::Cmpge;
case gt:
return &MacroAssembler::Cmpgt;
case le:
return &MacroAssembler::Cmple;
case lt:
return &MacroAssembler::Cmplt;
case eq:
return &MacroAssembler::Cmpeq;
case ne:
return &MacroAssembler::Cmpne;
default:
VIXL_UNIMPLEMENTED();
return NULL;
}
}
template <size_t N>
static void TestFpCompareHelper(Test* config,
int lane_size_in_bits,
Condition cond,
const double (&zn_inputs)[N],
const double (&zm_inputs)[N],
const int (&pd_expected)[N],
bool is_absolute = false) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
ZRegister zt_int_1 = z1.WithLaneSize(lane_size_in_bits);
ZRegister zt_int_2 = z2.WithLaneSize(lane_size_in_bits);
ZRegister zt_int_3 = z3.WithLaneSize(lane_size_in_bits);
ZRegister zt_fp_1 = z11.WithLaneSize(lane_size_in_bits);
ZRegister zt_fp_2 = z12.WithLaneSize(lane_size_in_bits);
ZRegister zt_fp_3 = z13.WithLaneSize(lane_size_in_bits);
ZRegister fp_one = z31.WithLaneSize(lane_size_in_bits);
PRegisterWithLaneSize pd_result_int_1 = p15.WithLaneSize(lane_size_in_bits);
PRegisterWithLaneSize pd_result_fp_1 = p14.WithLaneSize(lane_size_in_bits);
PRegisterWithLaneSize pd_result_int_2 = p13.WithLaneSize(lane_size_in_bits);
PRegisterWithLaneSize pd_result_fp_2 = p12.WithLaneSize(lane_size_in_bits);
FCmpFn fcmp = is_absolute ? GetFpAbsCompareFn(cond) : GetFpCompareFn(cond);
__ Ptrue(p1.VnB());
if (cond != uo) {
int pg_inputs[] = {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1};
Initialise(&masm, p0.WithLaneSize(lane_size_in_bits), pg_inputs);
__ Fdup(fp_one, 0.1f);
__ Index(zt_int_1, 3, 3);
__ Scvtf(zt_fp_1, p0.Merging(), zt_int_1);
__ Fadd(zt_fp_1, zt_fp_1, fp_one);
__ Index(zt_int_2, 3, -10);
__ Scvtf(zt_fp_2, p0.Merging(), zt_int_2);
__ Fadd(zt_fp_2, zt_fp_2, fp_one);
__ Index(zt_int_3, 3, 2);
__ Scvtf(zt_fp_3, p0.Merging(), zt_int_3);
__ Fadd(zt_fp_3, zt_fp_3, fp_one);
// There is no absolute comparison for integers, so use `Abs` with `Cmp<cc>`
// to synthesize the expected result for `fac<cc>`.
if (is_absolute) {
__ Abs(zt_int_2, p1.Merging(), zt_int_2);
}
CmpFn cmp = GetIntCompareFn(cond);
(masm.*cmp)(pd_result_int_1, p0.Zeroing(), zt_int_1, zt_int_2);
(masm.*fcmp)(pd_result_fp_1, p0.Zeroing(), zt_fp_1, zt_fp_2);
(masm.*cmp)(pd_result_int_2, p0.Zeroing(), zt_int_1, zt_int_3);
(masm.*fcmp)(pd_result_fp_2, p0.Zeroing(), zt_fp_1, zt_fp_3);
}
uint64_t zn_inputs_rawbits[N];
uint64_t zm_inputs_rawbits[N];
FPToRawbitsWithSize(zn_inputs, zn_inputs_rawbits, lane_size_in_bits);
FPToRawbitsWithSize(zm_inputs, zm_inputs_rawbits, lane_size_in_bits);
ZRegister zn_fp = z14.WithLaneSize(lane_size_in_bits);
ZRegister zm_fp = z15.WithLaneSize(lane_size_in_bits);
InsrHelper(&masm, zn_fp, zn_inputs_rawbits);
InsrHelper(&masm, zm_fp, zm_inputs_rawbits);
PRegisterWithLaneSize pd_result_fp_3 = p11.WithLaneSize(lane_size_in_bits);
(masm.*fcmp)(pd_result_fp_3, p1.Zeroing(), zn_fp, zm_fp);
END();
if (CAN_RUN()) {
RUN();
if (cond != uo) {
ASSERT_EQUAL_SVE(pd_result_int_1, pd_result_fp_1);
ASSERT_EQUAL_SVE(pd_result_int_2, pd_result_fp_2);
}
ASSERT_EQUAL_SVE(pd_expected, pd_result_fp_3);
}
}
TEST_SVE(sve_fp_compare_vectors) {
double inf_p = kFP64PositiveInfinity;
double inf_n = kFP64NegativeInfinity;
double nan = kFP64DefaultNaN;
// Normal floating-point comparison has been tested in the helper.
double zn[] = {0.0, inf_n, 1.0, inf_p, inf_p, nan, 0.0, nan};
double zm[] = {-0.0, inf_n, inf_n, -2.0, inf_n, nan, nan, inf_p};
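// A comparison with a NaN operand is unordered, so of these only Fcmne and
// Fcmuo produce true lanes for the NaN inputs above.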
int pd_fcm_gt[] = {0, 0, 1, 1, 1, 0, 0, 0};
int pd_fcm_lt[] = {0, 0, 0, 0, 0, 0, 0, 0};
int pd_fcm_ge[] = {1, 1, 1, 1, 1, 0, 0, 0};
int pd_fcm_le[] = {1, 1, 0, 0, 0, 0, 0, 0};
int pd_fcm_eq[] = {1, 1, 0, 0, 0, 0, 0, 0};
int pd_fcm_ne[] = {0, 0, 1, 1, 1, 1, 1, 1};
int pd_fcm_uo[] = {0, 0, 0, 0, 0, 1, 1, 1};
int pd_fac_gt[] = {0, 0, 0, 1, 0, 0, 0, 0};
int pd_fac_lt[] = {0, 0, 1, 0, 0, 0, 0, 0};
int pd_fac_ge[] = {1, 1, 0, 1, 1, 0, 0, 0};
int pd_fac_le[] = {1, 1, 1, 0, 1, 0, 0, 0};
int lane_sizes[] = {kHRegSize, kSRegSize, kDRegSize};
for (size_t i = 0; i < ArrayLength(lane_sizes); i++) {
int lane_size = lane_sizes[i];
// Test floating-point compare vectors.
TestFpCompareHelper(config, lane_size, gt, zn, zm, pd_fcm_gt);
TestFpCompareHelper(config, lane_size, lt, zn, zm, pd_fcm_lt);
TestFpCompareHelper(config, lane_size, ge, zn, zm, pd_fcm_ge);
TestFpCompareHelper(config, lane_size, le, zn, zm, pd_fcm_le);
TestFpCompareHelper(config, lane_size, eq, zn, zm, pd_fcm_eq);
TestFpCompareHelper(config, lane_size, ne, zn, zm, pd_fcm_ne);
TestFpCompareHelper(config, lane_size, uo, zn, zm, pd_fcm_uo);
// Test floating-point absolute compare vectors.
TestFpCompareHelper(config, lane_size, gt, zn, zm, pd_fac_gt, true);
TestFpCompareHelper(config, lane_size, lt, zn, zm, pd_fac_lt, true);
TestFpCompareHelper(config, lane_size, ge, zn, zm, pd_fac_ge, true);
TestFpCompareHelper(config, lane_size, le, zn, zm, pd_fac_le, true);
}
}
template <size_t N, typename T>
static void TestFpCompareZeroHelper(Test* config,
int lane_size_in_bits,
Condition cond,
const T (&zn_inputs)[N],
const int (&pd_expected)[N]) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
ZRegister zn = z28.WithLaneSize(lane_size_in_bits);
PRegisterWithLaneSize pd = p14.WithLaneSize(lane_size_in_bits);
uint64_t zn_rawbits[N];
FPToRawbitsWithSize(zn_inputs, zn_rawbits, lane_size_in_bits);
InsrHelper(&masm, zn, zn_rawbits);
__ Ptrue(p0.VnB());
(masm.*GetFpCompareZeroFn(cond))(pd, p0.Zeroing(), zn, 0.0);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(pd_expected, pd);
}
}
TEST_SVE(sve_fp_compare_vector_zero) {
Float16 fp16_inf_p = kFP16PositiveInfinity;
Float16 fp16_inf_n = kFP16NegativeInfinity;
Float16 fp16_dn = kFP16DefaultNaN;
Float16 fp16_sn = RawbitsToFloat16(0x7c22);
Float16 fp16_qn = RawbitsToFloat16(0x7e55);
float fp32_inf_p = kFP32PositiveInfinity;
float fp32_inf_n = kFP32NegativeInfinity;
float fp32_dn = kFP32DefaultNaN;
float fp32_sn = RawbitsToFloat(0x7f952222);
float fp32_qn = RawbitsToFloat(0x7fea2222);
double fp64_inf_p = kFP64PositiveInfinity;
double fp64_inf_n = kFP64NegativeInfinity;
double fp64_dn = kFP64DefaultNaN;
double fp64_sn = RawbitsToDouble(0x7ff5555511111111);
double fp64_qn = RawbitsToDouble(0x7ffaaaaa11111111);
// Normal floating-point comparison has been tested in the non-zero form.
Float16 zn_inputs_h[] = {Float16(0.0),
Float16(-0.0),
fp16_inf_p,
fp16_inf_n,
fp16_dn,
fp16_sn,
fp16_qn};
float zn_inputs_s[] =
{0.0, -0.0, fp32_inf_p, fp32_inf_n, fp32_dn, fp32_sn, fp32_qn};
double zn_inputs_d[] =
{0.0, -0.0, fp64_inf_p, fp64_inf_n, fp64_dn, fp64_sn, fp64_qn};
int pd_expected_gt[] = {0, 0, 1, 0, 0, 0, 0};
int pd_expected_lt[] = {0, 0, 0, 1, 0, 0, 0};
int pd_expected_ge[] = {1, 1, 1, 0, 0, 0, 0};
int pd_expected_le[] = {1, 1, 0, 1, 0, 0, 0};
int pd_expected_eq[] = {1, 1, 0, 0, 0, 0, 0};
int pd_expected_ne[] = {0, 0, 1, 1, 1, 1, 1};
TestFpCompareZeroHelper(config, kDRegSize, gt, zn_inputs_d, pd_expected_gt);
TestFpCompareZeroHelper(config, kDRegSize, lt, zn_inputs_d, pd_expected_lt);
TestFpCompareZeroHelper(config, kDRegSize, ge, zn_inputs_d, pd_expected_ge);
TestFpCompareZeroHelper(config, kDRegSize, le, zn_inputs_d, pd_expected_le);
TestFpCompareZeroHelper(config, kDRegSize, eq, zn_inputs_d, pd_expected_eq);
TestFpCompareZeroHelper(config, kDRegSize, ne, zn_inputs_d, pd_expected_ne);
TestFpCompareZeroHelper(config, kSRegSize, gt, zn_inputs_s, pd_expected_gt);
TestFpCompareZeroHelper(config, kSRegSize, lt, zn_inputs_s, pd_expected_lt);
TestFpCompareZeroHelper(config, kSRegSize, ge, zn_inputs_s, pd_expected_ge);
TestFpCompareZeroHelper(config, kSRegSize, le, zn_inputs_s, pd_expected_le);
TestFpCompareZeroHelper(config, kSRegSize, eq, zn_inputs_s, pd_expected_eq);
TestFpCompareZeroHelper(config, kSRegSize, ne, zn_inputs_s, pd_expected_ne);
TestFpCompareZeroHelper(config, kHRegSize, gt, zn_inputs_h, pd_expected_gt);
TestFpCompareZeroHelper(config, kHRegSize, lt, zn_inputs_h, pd_expected_lt);
TestFpCompareZeroHelper(config, kHRegSize, ge, zn_inputs_h, pd_expected_ge);
TestFpCompareZeroHelper(config, kHRegSize, le, zn_inputs_h, pd_expected_le);
TestFpCompareZeroHelper(config, kHRegSize, eq, zn_inputs_h, pd_expected_eq);
TestFpCompareZeroHelper(config, kHRegSize, ne, zn_inputs_h, pd_expected_ne);
}
typedef void (MacroAssembler::*FPUnaryMFn)(const ZRegister& zd,
const PRegisterM& pg,
const ZRegister& zn);
typedef void (MacroAssembler::*FPUnaryZFn)(const ZRegister& zd,
const PRegisterZ& pg,
const ZRegister& zn);
template <size_t N, size_t M>
static void TestFPUnaryPredicatedHelper(Test* config,
int src_size_in_bits,
int dst_size_in_bits,
uint64_t (&zn_inputs)[N],
const uint64_t (&pg_inputs)[M],
const uint64_t (&zd_expected)[N],
FPUnaryMFn macro_m,
FPUnaryZFn macro_z) {
// Provide the full predicate input.
VIXL_ASSERT(M == (kPRegMaxSize / kDRegSize));
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
int ds = dst_size_in_bits;
int ss = src_size_in_bits;
int ls = std::max(ss, ds);
// When the destination type is larger than the source type, fill the high
// parts with noise values, which should be ignored.
if (ds > ss) {
VIXL_ASSERT(ss < 64);
uint64_t zn_inputs_mod[N];
uint64_t sn = GetSignallingNan(ss);
for (unsigned i = 0; i < N; i++) {
zn_inputs_mod[i] = zn_inputs[i] | ((sn + i) << ss);
}
InsrHelper(&masm, z29.WithLaneSize(ls), zn_inputs_mod);
} else {
InsrHelper(&masm, z29.WithLaneSize(ls), zn_inputs);
}
// Make a copy so we can check that constructive operations preserve zn.
__ Mov(z28, z29);
// Run the operation on all lanes.
__ Ptrue(p0.WithLaneSize(ls));
(masm.*macro_m)(z27.WithLaneSize(ds), p0.Merging(), z28.WithLaneSize(ss));
Initialise(&masm,
p1.VnB(),
pg_inputs[3],
pg_inputs[2],
pg_inputs[1],
pg_inputs[0]);
// Clear the irrelevant lanes.
__ Index(z31.WithLaneSize(ls), 0, 1);
__ Cmplt(p1.WithLaneSize(ls), p1.Zeroing(), z31.WithLaneSize(ls), N);
// Check merging predication.
__ Index(z11.WithLaneSize(ls), 42, 1);
// Preserve the base value so we can derive the expected result.
__ Mov(z21, z11);
__ Mov(z9, z11);
(masm.*macro_m)(z11.WithLaneSize(ds), p1.Merging(), z28.WithLaneSize(ss));
// Generate expected values using explicit merging operations.
InsrHelper(&masm, z25.WithLaneSize(ls), zd_expected);
__ Mov(z21.WithLaneSize(ls), p1.Merging(), z25.WithLaneSize(ls));
// Check zeroing predication.
__ Index(z12.WithLaneSize(ds), 42, -1);
(masm.*macro_z)(z12.WithLaneSize(ds), p1.Zeroing(), z28.WithLaneSize(ss));
// Generate expected values using explicit zeroing operations.
InsrHelper(&masm, z30.WithLaneSize(ls), zd_expected);
// Emulate zeroing predication.
__ Dup(z22.WithLaneSize(ls), 0);
__ Mov(z22.WithLaneSize(ls), p1.Merging(), z30.WithLaneSize(ls));
// Check an in-place update.
__ Mov(z9.WithLaneSize(ls), p1.Merging(), z28.WithLaneSize(ls));
(masm.*macro_m)(z9.WithLaneSize(ds), p1.Merging(), z9.WithLaneSize(ss));
END();
if (CAN_RUN()) {
RUN();
// Check all lanes.
ASSERT_EQUAL_SVE(zd_expected, z27.WithLaneSize(ls));
// Check that constructive operations preserve their inputs.
ASSERT_EQUAL_SVE(z28, z29);
// Check merging predication.
ASSERT_EQUAL_SVE(z21.WithLaneSize(ls), z11.WithLaneSize(ls));
// Check zeroing predication.
ASSERT_EQUAL_SVE(z22.WithLaneSize(ls), z12.WithLaneSize(ls));
// Check in-place operation where zd == zn.
ASSERT_EQUAL_SVE(z21.WithLaneSize(ls), z9.WithLaneSize(ls));
}
}
template <size_t N, typename T>
static void TestFPUnaryPredicatedHelper(Test* config,
int src_size_in_bits,
int dst_size_in_bits,
T (&zn_inputs)[N],
const T (&zd_expected)[N],
FPUnaryMFn macro_m,
FPUnaryZFn macro_z) {
uint64_t pg_inputs[] = {0xa55aa55aa55aa55a,
0xa55aa55aa55aa55a,
0xa55aa55aa55aa55a,
0xa55aa55aa55aa55a};
TestFPUnaryPredicatedHelper(config,
src_size_in_bits,
dst_size_in_bits,
zn_inputs,
pg_inputs,
zd_expected,
macro_m,
macro_z);
// The complement of the above predicate, for full input coverage.
uint64_t pg_c_inputs[] = {0x5aa55aa55aa55aa5,
0x5aa55aa55aa55aa5,
0x5aa55aa55aa55aa5,
0x5aa55aa55aa55aa5};
TestFPUnaryPredicatedHelper(config,
src_size_in_bits,
dst_size_in_bits,
zn_inputs,
pg_c_inputs,
zd_expected,
macro_m,
macro_z);
}
template <size_t N, typename T>
static void TestFcvtHelper(Test* config,
int src_size_in_bits,
int dst_size_in_bits,
T (&zn_inputs)[N],
const T (&zd_expected)[N]) {
TestFPUnaryPredicatedHelper(config,
src_size_in_bits,
dst_size_in_bits,
zn_inputs,
zd_expected,
&MacroAssembler::Fcvt, // Merging form.
&MacroAssembler::Fcvt); // Zeroing form.
}
TEST_SVE(sve_fcvt) {
uint64_t h_vals[] = {0x7c00,
0xfc00,
0,
0x8000,
0x7bff, // Max half precision.
0x0400, // Min positive normal.
0x03ff, // Max subnormal.
0x0001}; // Min positive subnormal.
uint64_t s_vals[] = {0x7f800000,
0xff800000,
0,
0x80000000,
0x477fe000,
0x38800000,
0x387fc000,
0x33800000};
uint64_t d_vals[] = {0x7ff0000000000000,
0xfff0000000000000,
0,
0x8000000000000000,
0x40effc0000000000,
0x3f10000000000000,
0x3f0ff80000000000,
0x3e70000000000000};
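// Each array holds the same values at its own precision, so every conversion
// can be checked in both directions. For example, 65504.0 (the largest
// half-precision value) is 0x7bff as a half, 0x477fe000 as a single and
// 0x40effc0000000000 as a double.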
TestFcvtHelper(config, kHRegSize, kSRegSize, h_vals, s_vals);
TestFcvtHelper(config, kSRegSize, kHRegSize, s_vals, h_vals);
TestFcvtHelper(config, kSRegSize, kDRegSize, s_vals, d_vals);
TestFcvtHelper(config, kDRegSize, kSRegSize, d_vals, s_vals);
TestFcvtHelper(config, kHRegSize, kDRegSize, h_vals, d_vals);
TestFcvtHelper(config, kDRegSize, kHRegSize, d_vals, h_vals);
}
TEST_SVE(sve_fcvt_nan) {
uint64_t h_inputs[] = {0x7e55, // Quiet NaN.
0x7c22}; // Signalling NaN.
uint64_t h2s_expected[] = {0x7fcaa000, 0x7fc44000};
uint64_t h2d_expected[] = {0x7ff9540000000000, 0x7ff8880000000000};
uint64_t s_inputs[] = {0x7fc12345, // Quiet NaN.
0x7f812345}; // Signalling NaN.
uint64_t s2h_expected[] = {0x7e09, 0x7e09};
uint64_t s2d_expected[] = {0x7ff82468a0000000, 0x7ff82468a0000000};
uint64_t d_inputs[] = {0x7ffaaaaa22222222, // Quiet NaN.
0x7ff5555511111111}; // Signalling NaN.
uint64_t d2h_expected[] = {0x7eaa, 0x7f55};
uint64_t d2s_expected[] = {0x7fd55551, 0x7feaaaa8};
TestFcvtHelper(config, kHRegSize, kSRegSize, h_inputs, h2s_expected);
TestFcvtHelper(config, kSRegSize, kHRegSize, s_inputs, s2h_expected);
TestFcvtHelper(config, kHRegSize, kDRegSize, h_inputs, h2d_expected);
TestFcvtHelper(config, kDRegSize, kHRegSize, d_inputs, d2h_expected);
TestFcvtHelper(config, kSRegSize, kDRegSize, s_inputs, s2d_expected);
TestFcvtHelper(config, kDRegSize, kSRegSize, d_inputs, d2s_expected);
}
template <size_t N, typename T>
static void TestFrecpxHelper(Test* config,
int lane_size_in_bits,
T (&zn_inputs)[N],
const T (&zd_expected)[N]) {
TestFPUnaryPredicatedHelper(config,
lane_size_in_bits,
lane_size_in_bits,
zn_inputs,
zd_expected,
&MacroAssembler::Frecpx, // Merging form.
&MacroAssembler::Frecpx); // Zeroing form.
}
TEST_SVE(sve_frecpx_h) {
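// Frecpx preserves the sign, clears the fraction and bitwise-inverts the
// exponent field (zero and subnormal inputs produce the largest normal
// exponent). For example, 0x3c01 (exponent 15) maps to 0x4000 (exponent 16,
// i.e. ~15 in five bits).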
uint64_t zn_inputs[] = {Float16ToRawbits(kFP16PositiveInfinity),
Float16ToRawbits(kFP16NegativeInfinity),
Float16ToRawbits(Float16(0.0)),
Float16ToRawbits(Float16(-0.0)),
0x0001, // Smallest positive subnormal number.
0x03ff, // Largest subnormal number.
0x0400, // Smallest positive normal number.
0x7bff, // Largest normal number.
0x3bff, // Largest number less than one.
0x3c01, // Smallest number larger than one.
0x7c22, // Signalling NaN.
0x7e55}; // Quiet NaN.
uint64_t zd_expected[] = {0,
0x8000,
0x7800,
0xf800,
// The exponent of subnormal numbers is zero.
0x7800,
0x7800,
0x7800,
0x0400,
0x4400,
0x4000,
0x7e22, // To quiet NaN.
0x7e55};
TestFrecpxHelper(config, kHRegSize, zn_inputs, zd_expected);
}
TEST_SVE(sve_frecpx_s) {
uint64_t zn_inputs[] = {FloatToRawbits(kFP32PositiveInfinity),
FloatToRawbits(kFP32NegativeInfinity),
FloatToRawbits(65504), // Max half precision.
FloatToRawbits(6.10352e-5), // Min positive normal.
FloatToRawbits(6.09756e-5), // Max subnormal.
FloatToRawbits(
5.96046e-8), // Min positive subnormal.
FloatToRawbits(5e-9), // Not representable -> zero.
FloatToRawbits(-0.0),
FloatToRawbits(0.0),
0x7f952222, // Signalling NaN.
0x7fea2222}; // Quiet NaN.
uint64_t zd_expected[] = {0, // 0.0
0x80000000, // -0.0
0x38800000, // 6.10352e-05
0x47000000, // 32768
0x47800000, // 65536
0x4c800000, // 6.71089e+07
0x4e000000, // 5.36871e+08
0xff000000, // -1.70141e+38
0x7f000000, // 1.70141e+38
0x7fd52222,
0x7fea2222};
TestFrecpxHelper(config, kSRegSize, zn_inputs, zd_expected);
}
TEST_SVE(sve_frecpx_d) {
uint64_t zn_inputs[] = {DoubleToRawbits(kFP64PositiveInfinity),
DoubleToRawbits(kFP64NegativeInfinity),
DoubleToRawbits(65504), // Max half precision.
DoubleToRawbits(6.10352e-5), // Min positive normal.
DoubleToRawbits(6.09756e-5), // Max subnormal.
DoubleToRawbits(
5.96046e-8), // Min positive subnormal.
DoubleToRawbits(5e-9), // Not representable -> zero.
DoubleToRawbits(-0.0),
DoubleToRawbits(0.0),
0x7ff5555511111111, // Signalling NaN.
0x7ffaaaaa11111111}; // Quiet NaN.
uint64_t zd_expected[] = {0, // 0.0
0x8000000000000000, // -0.0
0x3f10000000000000, // 6.10352e-05
0x40e0000000000000, // 32768
0x40f0000000000000, // 65536
0x4190000000000000, // 6.71089e+07
0x41c0000000000000, // 5.36871e+08
0xffe0000000000000, // -1.70141e+38
0x7fe0000000000000, // 1.70141e+38
0x7ffd555511111111,
0x7ffaaaaa11111111};
TestFrecpxHelper(config, kDRegSize, zn_inputs, zd_expected);
}
template <size_t N, typename T>
static void TestFsqrtHelper(Test* config,
int lane_size_in_bits,
T (&zn_inputs)[N],
const T (&zd_expected)[N]) {
TestFPUnaryPredicatedHelper(config,
lane_size_in_bits,
lane_size_in_bits,
zn_inputs,
zd_expected,
&MacroAssembler::Fsqrt, // Merging form.
&MacroAssembler::Fsqrt); // Zeroing form.
}
TEST_SVE(sve_fsqrt_h) {
uint64_t zn_inputs[] =
{Float16ToRawbits(Float16(0.0)),
Float16ToRawbits(Float16(-0.0)),
Float16ToRawbits(Float16(1.0)),
Float16ToRawbits(Float16(65025.0)),
Float16ToRawbits(kFP16PositiveInfinity),
Float16ToRawbits(kFP16NegativeInfinity),
Float16ToRawbits(Float16(6.10352e-5)), // Min normal positive.
Float16ToRawbits(Float16(65504.0)), // Max normal positive float.
Float16ToRawbits(Float16(6.09756e-5)), // Max subnormal.
Float16ToRawbits(Float16(5.96046e-8)), // Min subnormal positive.
0x7c22, // Signalling NaN.
0x7e55}; // Quiet NaN.
uint64_t zd_expected[] = {Float16ToRawbits(Float16(0.0)),
Float16ToRawbits(Float16(-0.0)),
Float16ToRawbits(Float16(1.0)),
Float16ToRawbits(Float16(255.0)),
Float16ToRawbits(kFP16PositiveInfinity),
Float16ToRawbits(kFP16DefaultNaN),
0x2000,
0x5bff,
0x1fff,
0x0c00,
0x7e22, // To quiet NaN.
0x7e55};
TestFsqrtHelper(config, kHRegSize, zn_inputs, zd_expected);
}
TEST_SVE(sve_fsqrt_s) {
uint64_t zn_inputs[] = {FloatToRawbits(0.0f),
FloatToRawbits(-0.0f),
FloatToRawbits(1.0f),
FloatToRawbits(65536.0f),
FloatToRawbits(kFP32PositiveInfinity),
FloatToRawbits(kFP32NegativeInfinity),
0x00800000, // Min normal positive, ~1.17e-38
0x7f7fffff, // Max normal positive, ~3.40e+38
0x00000001, // Min subnormal positive, ~1.40e-45
0x007fffff, // Max subnormal, ~1.17e-38
0x7f951111, // Signalling NaN.
0x7fea1111}; // Quiet NaN.
uint64_t zd_expected[] = {FloatToRawbits(0.0f),
FloatToRawbits(-0.0f),
FloatToRawbits(1.0f),
FloatToRawbits(256.0f),
FloatToRawbits(kFP32PositiveInfinity),
FloatToRawbits(kFP32DefaultNaN),
0x20000000, // ~1.08e-19
0x5f7fffff, // ~1.84e+19
0x1a3504f3, // ~3.74e-23
0x1fffffff, // ~1.08e-19
0x7fd51111, // To quiet NaN.
0x7fea1111};
TestFsqrtHelper(config, kSRegSize, zn_inputs, zd_expected);
}
TEST_SVE(sve_fsqrt_d) {
uint64_t zn_inputs[] =
{DoubleToRawbits(0.0),
DoubleToRawbits(-0.0),
DoubleToRawbits(1.0),
DoubleToRawbits(65536.0),
DoubleToRawbits(kFP64PositiveInfinity),
DoubleToRawbits(kFP64NegativeInfinity),
0x0010000000000000, // Min normal positive, ~2.22e-308
0x7fefffffffffffff, // Max normal positive, ~1.79e+308
0x0000000000000001, // Min subnormal positive, 5e-324
0x000fffffffffffff, // Max subnormal, ~2.22e-308
0x7ff5555511111111, // Signalling NaN.
0x7ffaaaaa11111111}; // Quiet NaN.
uint64_t zd_expected[] = {DoubleToRawbits(0.0),
DoubleToRawbits(-0.0),
DoubleToRawbits(1.0),
DoubleToRawbits(256.0),
DoubleToRawbits(kFP64PositiveInfinity),
DoubleToRawbits(kFP64DefaultNaN),
0x2000000000000000, // ~1.49e-154
0x5fefffffffffffff, // ~1.34e+154
0x1e60000000000000, // ~2.22e-162
0x1fffffffffffffff, // ~1.49e-154
0x7ffd555511111111, // To quiet NaN.
0x7ffaaaaa11111111};
TestFsqrtHelper(config, kDRegSize, zn_inputs, zd_expected);
}
TEST_SVE(sve_adr) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
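// Adr computes one address per lane: zd[i] = zn[i] + (extend(zm[i]) << shift).
// For example, with a base lane of 0x10000000f0000000 and an offset lane of 1,
// the unshifted D-lane form produces 0x10000000f0000001.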
__ Index(z0.VnD(), 0x10000000f0000000, 0x1000);
__ Index(z1.VnD(), 1, 3);
__ Index(z2.VnS(), -1, -1);
__ Adr(z3.VnD(), SVEMemOperand(z0.VnD(), z1.VnD()));
__ Adr(z4.VnD(), SVEMemOperand(z0.VnD(), z1.VnD(), LSL, 1));
__ Adr(z5.VnD(), SVEMemOperand(z0.VnD(), z1.VnD(), LSL, 2));
__ Adr(z6.VnD(), SVEMemOperand(z0.VnD(), z1.VnD(), LSL, 3));
__ Adr(z7.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), UXTW));
__ Adr(z8.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), UXTW, 1));
__ Adr(z9.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), UXTW, 2));
__ Adr(z10.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), UXTW, 3));
__ Adr(z11.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), SXTW));
__ Adr(z12.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), SXTW, 1));
__ Adr(z13.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), SXTW, 2));
__ Adr(z14.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), SXTW, 3));
__ Adr(z15.VnS(), SVEMemOperand(z0.VnS(), z2.VnS()));
__ Adr(z16.VnS(), SVEMemOperand(z0.VnS(), z2.VnS(), LSL, 1));
__ Adr(z17.VnS(), SVEMemOperand(z0.VnS(), z2.VnS(), LSL, 2));
__ Adr(z18.VnS(), SVEMemOperand(z0.VnS(), z2.VnS(), LSL, 3));
END();
if (CAN_RUN()) {
RUN();
uint64_t expected_z3[] = {0x10000000f0001004, 0x10000000f0000001};
uint64_t expected_z4[] = {0x10000000f0001008, 0x10000000f0000002};
uint64_t expected_z5[] = {0x10000000f0001010, 0x10000000f0000004};
uint64_t expected_z6[] = {0x10000000f0001020, 0x10000000f0000008};
uint64_t expected_z7[] = {0x10000001f0000ffd, 0x10000001efffffff};
uint64_t expected_z8[] = {0x10000002f0000ffa, 0x10000002effffffe};
uint64_t expected_z9[] = {0x10000004f0000ff4, 0x10000004effffffc};
uint64_t expected_z10[] = {0x10000008f0000fe8, 0x10000008effffff8};
uint64_t expected_z11[] = {0x10000000f0000ffd, 0x10000000efffffff};
uint64_t expected_z12[] = {0x10000000f0000ffa, 0x10000000effffffe};
uint64_t expected_z13[] = {0x10000000f0000ff4, 0x10000000effffffc};
uint64_t expected_z14[] = {0x10000000f0000fe8, 0x10000000effffff8};
uint64_t expected_z15[] = {0x0ffffffcf0000ffd, 0x0ffffffeefffffff};
uint64_t expected_z16[] = {0x0ffffff8f0000ffa, 0x0ffffffceffffffe};
uint64_t expected_z17[] = {0x0ffffff0f0000ff4, 0x0ffffff8effffffc};
uint64_t expected_z18[] = {0x0fffffe0f0000fe8, 0x0ffffff0effffff8};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
ASSERT_EQUAL_SVE(expected_z15, z15.VnD());
ASSERT_EQUAL_SVE(expected_z16, z16.VnD());
ASSERT_EQUAL_SVE(expected_z17, z17.VnD());
ASSERT_EQUAL_SVE(expected_z18, z18.VnD());
}
}
// Test load-and-broadcast instructions by comparing their results with those
// of a set of equivalent scalar loads.
template <typename F>
static void LoadBcastHelper(Test* config,
unsigned msize_in_bits,
unsigned esize_in_bits,
F sve_ld1,
bool is_signed) {
VIXL_ASSERT((esize_in_bits == kBRegSize) || (esize_in_bits == kHRegSize) ||
(esize_in_bits == kSRegSize) || (esize_in_bits == kDRegSize));
static const unsigned kMaxLaneCount = kZRegMaxSize / kBRegSize;
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
unsigned msize_in_bytes = msize_in_bits / kBitsPerByte;
unsigned esize_in_bytes = esize_in_bits / kBitsPerByte;
int vl = config->sve_vl_in_bytes();
uint64_t offsets[kMaxLaneCount];
uint64_t buffer_size = vl * 64;
uint64_t data = reinterpret_cast<uintptr_t>(malloc(buffer_size));
BufferFillingHelper(data,
buffer_size,
msize_in_bytes,
kMaxLaneCount,
offsets);
for (unsigned i = 0; i < (kMaxLaneCount / 2); i++) {
// Assign encodable offsets to the first half of the offset array so that
// both encodable and unencodable offsets are tested. Note that the
// immediate offset is encoded in a six-bit field, scaled by the memory
// access size.
offsets[i] = (offsets[i] % (UINT64_C(1) << 6)) * msize_in_bytes;
}
ZRegister zn = z0.WithLaneSize(esize_in_bits);
ZRegister zn_ref = z4.WithLaneSize(esize_in_bits);
PRegisterZ pg = p0.Zeroing();
Initialise(&masm,
pg,
0x9abcdef012345678,
0xabcdef0123456789,
0xf4f3f1f0fefdfcfa,
0xf9f8f6f5f3f2f0ff);
__ Mov(x2, data);
uint64_t enablable_offset = offsets[0];
// Simple check that the operation is correct for a single offset.
(masm.*sve_ld1)(zn, pg, SVEMemOperand(x2, enablable_offset));
// Generate a reference result using scalar loads.
uint64_t address = data + enablable_offset;
uint64_t duplicated_addresses[kMaxLaneCount];
for (unsigned i = 0; i < kMaxLaneCount; i++) {
duplicated_addresses[i] = address;
}
ScalarLoadHelper(&masm,
vl,
duplicated_addresses,
zn_ref,
pg,
esize_in_bits,
msize_in_bits,
is_signed);
ZRegister zn_agg = z10.WithLaneSize(esize_in_bits);
ZRegister zn_agg_ref = z11.WithLaneSize(esize_in_bits);
ZRegister zn_temp = z12.WithLaneSize(esize_in_bits);
__ Dup(zn_agg, 0);
__ Dup(zn_agg_ref, 0);
// Check that the operation is correct at a variety of offsets: extract one
// lane of each broadcast result with Lastb and collect it into an aggregate
// vector with Insr, alongside a scalar-load reference built the same way.
for (unsigned i = 0; i < (vl / esize_in_bytes); i++) {
(masm.*sve_ld1)(zn_temp, pg, SVEMemOperand(x2, offsets[i]));
__ Lastb(x1, pg, zn_temp);
__ Insr(zn_agg, x1);
__ Mov(x3, data + offsets[i]);
ScalarLoadHelper(&masm, x1, x3, msize_in_bits, is_signed);
__ Insr(zn_agg_ref, x1);
}
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(zn_ref, zn);
ASSERT_EQUAL_SVE(zn_agg_ref, zn_agg);
}
free(reinterpret_cast<void*>(data));
}
TEST_SVE(sve_ld1rb) {
LoadBcastHelper(config, kBRegSize, kBRegSize, &MacroAssembler::Ld1rb, false);
LoadBcastHelper(config, kBRegSize, kHRegSize, &MacroAssembler::Ld1rb, false);
LoadBcastHelper(config, kBRegSize, kSRegSize, &MacroAssembler::Ld1rb, false);
LoadBcastHelper(config, kBRegSize, kDRegSize, &MacroAssembler::Ld1rb, false);
}
TEST_SVE(sve_ld1rh) {
LoadBcastHelper(config, kHRegSize, kHRegSize, &MacroAssembler::Ld1rh, false);
LoadBcastHelper(config, kHRegSize, kSRegSize, &MacroAssembler::Ld1rh, false);
LoadBcastHelper(config, kHRegSize, kDRegSize, &MacroAssembler::Ld1rh, false);
}
TEST_SVE(sve_ld1rw) {
LoadBcastHelper(config, kSRegSize, kSRegSize, &MacroAssembler::Ld1rw, false);
LoadBcastHelper(config, kSRegSize, kDRegSize, &MacroAssembler::Ld1rw, false);
}
TEST_SVE(sve_ld1rd) {
LoadBcastHelper(config, kDRegSize, kDRegSize, &MacroAssembler::Ld1rd, false);
}
TEST_SVE(sve_ld1rsb) {
LoadBcastHelper(config, kBRegSize, kHRegSize, &MacroAssembler::Ld1rsb, true);
LoadBcastHelper(config, kBRegSize, kSRegSize, &MacroAssembler::Ld1rsb, true);
LoadBcastHelper(config, kBRegSize, kDRegSize, &MacroAssembler::Ld1rsb, true);
}
TEST_SVE(sve_ld1rsh) {
LoadBcastHelper(config, kHRegSize, kSRegSize, &MacroAssembler::Ld1rsh, true);
LoadBcastHelper(config, kHRegSize, kDRegSize, &MacroAssembler::Ld1rsh, true);
}
TEST_SVE(sve_ld1rsw) {
LoadBcastHelper(config, kSRegSize, kDRegSize, &MacroAssembler::Ld1rsw, true);
}
TEST_SVE(sve_prefetch_offset) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
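// Prefetch instructions have no architecturally visible effect, so there is
// nothing to assert; this test simply checks that every addressing form can
// be encoded and executed.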
__ Prfb(PLDL1KEEP, p5, SVEMemOperand(z30.VnS(), 0));
__ Prfb(PLDL1STRM, p5, SVEMemOperand(x28, -11, SVE_MUL_VL));
__ Prfb(PLDL2KEEP, p6, SVEMemOperand(x30, x29));
__ Prfb(PLDL2STRM, p6, SVEMemOperand(x7, z12.VnS(), UXTW));
__ Prfh(PSTL2KEEP, p6, SVEMemOperand(z0.VnS(), 28));
__ Prfh(PSTL2STRM, p4, SVEMemOperand(x17, -3, SVE_MUL_VL));
__ Prfh(PSTL3KEEP, p3, SVEMemOperand(x0, x0, LSL, 1));
__ Prfh(PSTL3STRM, p4, SVEMemOperand(x20, z0.VnD(), LSL, 1));
__ Prfw(PLDL1KEEP, p3, SVEMemOperand(z23.VnD(), 5));
__ Prfw(PLDL1STRM, p1, SVEMemOperand(x4, 10, SVE_MUL_VL));
__ Prfw(PLDL2KEEP, p2, SVEMemOperand(x22, x22, LSL, 2));
__ Prfw(PLDL2STRM, p1, SVEMemOperand(x2, z6.VnS(), SXTW, 2));
__ Prfd(PLDL3KEEP, p5, SVEMemOperand(z11.VnD(), 9));
__ Prfd(PLDL3STRM, p3, SVEMemOperand(x0, -24, SVE_MUL_VL));
__ Prfd(PSTL1KEEP, p7, SVEMemOperand(x5, x5, LSL, 3));
__ Prfd(PSTL1STRM, p1, SVEMemOperand(x19, z18.VnS(), SXTW, 3));
END();
if (CAN_RUN()) {
RUN();
}
}
TEST_SVE(sve2_match_nmatch) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
START();
__ Ptrue(p0.VnB());
__ Ptrue(p1.VnH());
__ Ptrue(p2.VnS());
// Vector to search is bytes 0 - 7, repeating every eight bytes.
__ Index(z0.VnB(), 0, 1);
__ Dup(z0.VnD(), z0.VnD(), 0);
// Elements to find are (repeated) bytes 0 - 3 in the first segment, 4 - 7
// in the second, 8 - 11 in the third, etc.
__ Index(z1.VnB(), 0, 1);
__ Lsr(z1.VnB(), z1.VnB(), 2);
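// Match sets a predicate lane when the corresponding element of the first
// source occurs anywhere within the same 128-bit segment of the second
// source; Nmatch is its inverse.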
__ Match(p3.VnB(), p0.Zeroing(), z0.VnB(), z1.VnB());
__ Match(p4.VnB(), p1.Zeroing(), z0.VnB(), z1.VnB());
__ Nmatch(p0.VnB(), p0.Zeroing(), z0.VnB(), z1.VnB());
__ Uunpklo(z0.VnH(), z0.VnB());
__ Uunpklo(z1.VnH(), z1.VnB());
__ Match(p5.VnH(), p1.Zeroing(), z0.VnH(), z1.VnH());
__ Match(p6.VnH(), p2.Zeroing(), z0.VnH(), z1.VnH());
__ Nmatch(p1.VnH(), p1.Zeroing(), z0.VnH(), z1.VnH());
END();
if (CAN_RUN()) {
RUN();
int p3_exp[] = {1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0,
0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1};
ASSERT_EQUAL_SVE(p3_exp, p3.VnB());
int p4_exp[] = {0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1};
ASSERT_EQUAL_SVE(p4_exp, p4.VnB());
int p0_exp[] = {0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1,
1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0};
ASSERT_EQUAL_SVE(p0_exp, p0.VnB());
int p5_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1};
ASSERT_EQUAL_SVE(p5_exp, p5.VnB());
int p6_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
ASSERT_EQUAL_SVE(p6_exp, p6.VnB());
int p1_exp[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0};
ASSERT_EQUAL_SVE(p1_exp, p1.VnB());
}
}
TEST_SVE(sve2_saba_uaba) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
START();
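// Saba and Uaba accumulate the signed/unsigned absolute difference of their
// sources: zd = zd + |zn - zm|. For lane 1 below: 1 + |1 - 0xff| = 0xff.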
__ Index(z0.VnB(), 0, 1);
__ Dup(z1.VnB(), 0xff);
__ Dup(z2.VnB(), 1);
__ Uaba(z2.VnB(), z2.VnB(), z0.VnB(), z1.VnB());
__ Index(z0.VnB(), 0, -1);
__ Index(z3.VnH(), 0, 1);
__ Index(z4.VnH(), 1, 1);
__ Uaba(z3.VnH(), z3.VnH(), z3.VnH(), z4.VnH());
__ Index(z5.VnS(), 3, 6);
__ Index(z6.VnS(), 5, 6);
__ Uaba(z5.VnS(), z5.VnS(), z5.VnS(), z6.VnS());
__ Index(z7.VnD(), 424, 12);
__ Index(z8.VnD(), 4242, 12);
__ Uaba(z7.VnD(), z7.VnD(), z7.VnD(), z8.VnD());
__ Index(z9.VnH(), -1, -1);
__ Dup(z10.VnB(), 0);
__ Saba(z10.VnB(), z10.VnB(), z9.VnB(), z10.VnB());
__ Index(z11.VnH(), 0x0101, 1);
__ Index(z12.VnH(), 0, 1);
__ Index(z13.VnH(), 0, -1);
__ Saba(z13.VnH(), z13.VnH(), z12.VnH(), z13.VnH());
__ Index(z14.VnS(), 0, 2);
__ Index(z15.VnS(), 0, -2);
__ Saba(z15.VnS(), z15.VnS(), z14.VnS(), z15.VnS());
__ Index(z16.VnD(), 0, 42);
__ Index(z17.VnD(), 0, -42);
__ Saba(z17.VnD(), z17.VnD(), z16.VnD(), z17.VnD());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(z0, z2);
ASSERT_EQUAL_SVE(z3, z4);
ASSERT_EQUAL_SVE(z5, z6);
ASSERT_EQUAL_SVE(z7, z8);
ASSERT_EQUAL_SVE(z10, z11);
ASSERT_EQUAL_SVE(z12, z13);
ASSERT_EQUAL_SVE(z14, z15);
ASSERT_EQUAL_SVE(z16, z17);
}
}
TEST_SVE(sve2_integer_multiply_long_vector) {
// This test only checks Sqdmull[b|t] and Pmull[b|t]; the other instructions
// in the group operate on their elements in the same way.
int32_t zn_inputs_s[] =
{1, -2, 3, -4, 5, -6, 7, -8, INT32_MIN, INT32_MAX, INT32_MAX, INT32_MIN};
int32_t zm_inputs_s[] =
{1, 2, 3, 4, 5, 6, 7, 8, INT32_MAX, INT32_MIN, INT32_MAX, INT32_MIN};
int64_t sqdmullb_vec_expected_d[] =
{-8, -32, -72, -128, RawbitsToInt64(0x8000000100000000), INT64_MAX};
uint64_t sqdmullt_vec_expected_d[] =
{2, 18, 50, 98, 0x8000000100000000, 0x7ffffffe00000002};
uint64_t pmullb_vec_expected_d[] = {0x00000001fffffffc,
0x00000003fffffff0,
0x000000020000001c,
0x00000007ffffffc0,
0x3fffffff80000000,
0x4000000000000000};
uint64_t pmullt_vec_expected_d[] = {0x05,
0x11,
0x15,
0x3fffffff80000000,
0x1555555555555555};
uint64_t sqdmullb_idx_expected_d[] = {0xfffffffffffffff8,
0xfffffffffffffff0,
0xffffffffffffffb8,
0xffffffffffffffa0,
0x8000000100000000,
INT64_MAX};
uint64_t sqdmullt_idx_expected_d[] =
{8, // 2 * zn[11] * zm[8] = 2 * 4 * 1
24, // 2 * zn[9] * zm[8] = 2 * 4 * 3
80, // 2 * zn[7] * zm[4] = 2 * 8 * 5
112, // 2 * zn[5] * zm[4] = 2 * 8 * 7
0x7fffffffffffffff, // 2 * zn[3] * zm[0]
0x8000000100000000}; // 2 * zn[1] * zm[0]
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
START();
InsrHelper(&masm, z31.VnS(), zn_inputs_s);
InsrHelper(&masm, z30.VnS(), zm_inputs_s);
__ Sqdmullb(z1.VnD(), z31.VnS(), z30.VnS());
__ Sqdmullt(z2.VnD(), z31.VnS(), z30.VnS());
__ Pmullb(z3.VnD(), z31.VnS(), z30.VnS());
__ Pmullt(z4.VnD(), z31.VnS(), z30.VnS());
__ Mov(z7, z30);
__ Mov(z8, z31);
__ Sqdmullb(z5.VnD(), z8.VnS(), z7.VnS(), 2);
__ Sqdmullt(z6.VnD(), z8.VnS(), z7.VnS(), 0);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(sqdmullb_vec_expected_d, z1.VnD());
ASSERT_EQUAL_SVE(sqdmullt_vec_expected_d, z2.VnD());
ASSERT_EQUAL_SVE(pmullb_vec_expected_d, z3.VnD());
ASSERT_EQUAL_SVE(pmullt_vec_expected_d, z4.VnD());
ASSERT_EQUAL_SVE(sqdmullb_idx_expected_d, z5.VnD());
ASSERT_EQUAL_SVE(sqdmullt_idx_expected_d, z6.VnD());
}
}
TEST_SVE(sve2_integer_multiply_add_long_vector) {
int32_t zn_inputs_s[] =
{1, -2, 3, -4, 5, -6, 7, -8, INT32_MIN, INT32_MAX, INT32_MAX, INT32_MIN};
int32_t zm_inputs_s[] =
{1, 2, 3, 4, 5, 6, 7, 8, INT32_MAX, INT32_MIN, INT32_MAX, INT32_MIN};
int64_t sqdmlalb_vec_expected_d[] =
{-3, -28, -69, -126, RawbitsToInt64(0x8000000100000001), INT64_MAX};
int64_t sqdmlalt_vec_expected_d[] = {-3,
14,
47,
96,
RawbitsToInt64(0x80000000ffffffff),
static_cast<int64_t>(
0x7ffffffe00000002)};
int64_t sqdmlalb_idx_expected_d[] =
{-11, // za.d[5] + 2 * zn.s[10] * zm.s[8] = 5 + 2 * -2 * 4
-28, // za.d[4] + 2 * zn.s[8] * zm.s[8] = 4 + 2 * -4 * 4
-93, // za.d[3] + 2 * zn.s[6] * zm.s[4] = 3 + 2 * -6 * 8
-126, // za.d[2] + 2 * zn.s[4] * zm.s[4] = 2 + 2 * -8 * 8
RawbitsToInt64(0x8000000100000001),
INT64_MAX};
int64_t sqdmlalt_idx_expected_d[] =
{1, // za.d[5] + 2 * zn.s[11] * zm.s[9] = -5 + 2 * 1 * 3
14, // za.d[4] + 2 * zn.s[9] * zm.s[9] = -4 + 2 * 3 * 3
67, // za.d[3] + 2 * zn.s[7] * zm.s[5] = -3 + 2 * 5 * 7
96, // za.d[2] + 2 * zn.s[5] * zm.s[5] = -2 + 2 * 7 * 7
RawbitsToInt64(0x80000000ffffffff),
static_cast<int64_t>(0x7ffffffe00000002)};
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
START();
InsrHelper(&masm, z0.VnS(), zn_inputs_s);
InsrHelper(&masm, z1.VnS(), zm_inputs_s);
__ Index(z2.VnD(), 0, 1);
__ Index(z3.VnD(), 0, -1);
__ Mov(z31, z2);
__ Sqdmlalb(z31.VnD(), z31.VnD(), z0.VnS(), z1.VnS());
__ Mov(z30, z3);
__ Sqdmlalt(z30.VnD(), z30.VnD(), z0.VnS(), z1.VnS());
__ Mov(z29, z31);
__ Sqdmlslb(z29.VnD(), z29.VnD(), z0.VnS(), z1.VnS());
__ Mov(z28, z30);
__ Sqdmlslt(z28.VnD(), z28.VnD(), z0.VnS(), z1.VnS());
__ Sqdmlalb(z27.VnD(), z2.VnD(), z0.VnS(), z1.VnS());
__ Sqdmlalt(z26.VnD(), z3.VnD(), z0.VnS(), z1.VnS());
__ Sqdmlslb(z25.VnD(), z27.VnD(), z0.VnS(), z1.VnS());
__ Sqdmlslt(z24.VnD(), z26.VnD(), z0.VnS(), z1.VnS());
__ Mov(z23, z2);
__ Sqdmlalb(z23.VnD(), z23.VnD(), z0.VnS(), z1.VnS(), 0);
__ Mov(z22, z3);
__ Sqdmlalt(z22.VnD(), z22.VnD(), z0.VnS(), z1.VnS(), 1);
__ Mov(z21, z23);
__ Sqdmlslb(z21.VnD(), z21.VnD(), z0.VnS(), z1.VnS(), 0);
__ Mov(z20, z22);
__ Sqdmlslt(z20.VnD(), z20.VnD(), z0.VnS(), z1.VnS(), 1);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(sqdmlalb_vec_expected_d, z31.VnD());
ASSERT_EQUAL_SVE(sqdmlalt_vec_expected_d, z30.VnD());
ASSERT_EQUAL_SVE(z2, z29);
ASSERT_EQUAL_SVE(z3, z28);
ASSERT_EQUAL_SVE(z31, z27);
ASSERT_EQUAL_SVE(z30, z26);
ASSERT_EQUAL_SVE(z29, z25);
ASSERT_EQUAL_SVE(z28, z24);
ASSERT_EQUAL_SVE(sqdmlalb_idx_expected_d, z23.VnD());
ASSERT_EQUAL_SVE(sqdmlalt_idx_expected_d, z22.VnD());
ASSERT_EQUAL_SVE(z2, z21);
ASSERT_EQUAL_SVE(z3, z20);
}
}
TEST_SVE(sve2_ldnt1) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
START();
int data_size = kZRegMaxSizeInBytes * 4;
uint8_t* data = new uint8_t[data_size];
for (int i = 0; i < data_size; i++) {
data[i] = i & 0xff;
}
// Set the base half-way through the buffer so we can use negative indices.
__ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
__ Index(z30.VnD(), x0, 1);
__ Ptrue(p0.VnB());
__ Punpklo(p1.VnH(), p0.VnB());
__ Punpklo(p2.VnH(), p1.VnB());
__ Punpklo(p3.VnH(), p2.VnB());
__ Punpklo(p4.VnH(), p3.VnB());
__ Mov(x1, 1);
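// The SVE2 non-temporal gather form takes a vector of addresses plus a
// scalar offset ([z30.d, x1]); each result is compared with the equivalent
// scalar-plus-vector gather load.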
__ Ldnt1b(z0.VnD(), p1.Zeroing(), SVEMemOperand(z30.VnD(), x1));
__ Ld1b(z1.VnD(), p1.Zeroing(), SVEMemOperand(x1, z30.VnD()));
__ Mov(x1, -4);
__ Ldnt1h(z2.VnD(), p2.Zeroing(), SVEMemOperand(z30.VnD(), x1));
__ Ld1h(z3.VnD(), p2.Zeroing(), SVEMemOperand(x1, z30.VnD()));
__ Mov(x1, 16);
__ Ldnt1w(z4.VnD(), p3.Zeroing(), SVEMemOperand(z30.VnD(), x1));
__ Ld1w(z5.VnD(), p3.Zeroing(), SVEMemOperand(x1, z30.VnD()));
__ Mov(x1, -16);
__ Ldnt1d(z6.VnD(), p4.Zeroing(), SVEMemOperand(z30.VnD(), x1));
__ Ld1d(z7.VnD(), p4.Zeroing(), SVEMemOperand(x1, z30.VnD()));
__ Mov(x1, 1);
__ Ldnt1sb(z8.VnD(), p0.Zeroing(), SVEMemOperand(z30.VnD(), x1));
__ Ld1sb(z9.VnD(), p0.Zeroing(), SVEMemOperand(x1, z30.VnD()));
__ Mov(x1, -4);
__ Ldnt1sh(z10.VnD(), p2.Zeroing(), SVEMemOperand(z30.VnD(), x1));
__ Ld1sh(z11.VnD(), p2.Zeroing(), SVEMemOperand(x1, z30.VnD()));
__ Mov(x1, 16);
__ Ldnt1sw(z12.VnD(), p3.Zeroing(), SVEMemOperand(z30.VnD(), x1));
__ Ld1sw(z13.VnD(), p3.Zeroing(), SVEMemOperand(x1, z30.VnD()));
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(z0, z1);
ASSERT_EQUAL_SVE(z2, z3);
ASSERT_EQUAL_SVE(z4, z5);
ASSERT_EQUAL_SVE(z6, z7);
ASSERT_EQUAL_SVE(z8, z9);
ASSERT_EQUAL_SVE(z10, z11);
ASSERT_EQUAL_SVE(z12, z13);
}
delete[] data;
}
TEST_SVE(sve2_stnt1) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
START();
int data_size = kZRegMaxSizeInBytes * 4;
uint8_t* data = new uint8_t[data_size];
// Set the base half-way through the buffer so we can use negative indices.
__ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
__ Ptrue(p0.VnB());
__ Punpklo(p1.VnH(), p0.VnB());
__ Punpklo(p2.VnH(), p1.VnB());
__ Punpklo(p3.VnH(), p2.VnB());
__ Punpklo(p4.VnH(), p3.VnB());
__ Dup(z0.VnB(), 0xaa);
__ Dup(z1.VnB(), 0x55);
__ Rdvl(x1, 1);
__ Mov(x3, 0);
// Put store addresses into z30, and a small offset in x4.
__ Index(z30.VnD(), x0, 1);
__ Mov(x4, 2);
// Store an entire vector of 0xaa to the buffer, then a smaller scatter store
// of 0x55 using Stnt1b.
__ St1b(z0.VnB(), p0, SVEMemOperand(x0, x4));
__ Stnt1b(z1.VnD(), p0, SVEMemOperand(z30.VnD(), x4));
// Load the entire vector back from the buffer.
__ Ld1b(z2.VnB(), p0.Zeroing(), SVEMemOperand(x0, x4));
// Construct a predicate that reflects the number of bytes stored by Stnt1b,
// based on the current VL, and use Sel to obtain a reference vector for
// comparison.
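// x2 = VL / 8: with .D addressing, Stnt1b stores one byte per lane.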
__ Lsr(x2, x1, 3);
__ Whilelo(p5.VnB(), x3, x2);
__ Sel(z3.VnB(), p5.Merging(), z1.VnB(), z0.VnB());
// Repeat for larger element sizes.
__ Mov(x4, -4);
__ Index(z30.VnD(), x0, 2);
__ St1b(z0.VnB(), p0, SVEMemOperand(x0, x4));
__ Stnt1h(z1.VnD(), p0, SVEMemOperand(z30.VnD(), x4));
__ Ld1b(z4.VnB(), p0.Zeroing(), SVEMemOperand(x0, x4));
__ Lsr(x2, x1, 2);
__ Whilelo(p5.VnB(), x3, x2);
__ Sel(z5.VnB(), p5.Merging(), z1.VnB(), z0.VnB());
__ Mov(x4, 16);
__ Index(z30.VnD(), x0, 4);
__ St1b(z0.VnB(), p0, SVEMemOperand(x0, x4));
__ Stnt1w(z1.VnD(), p0, SVEMemOperand(z30.VnD(), x4));
__ Ld1b(z6.VnB(), p0.Zeroing(), SVEMemOperand(x0, x4));
__ Lsr(x2, x1, 1);
__ Whilelo(p5.VnB(), x3, x2);
__ Sel(z7.VnB(), p5.Merging(), z1.VnB(), z0.VnB());
__ Mov(x4, -16);
__ Index(z30.VnD(), x0, 8);
__ St1b(z0.VnB(), p0, SVEMemOperand(x0, x4));
__ Stnt1d(z1.VnD(), p0, SVEMemOperand(z30.VnD(), x4));
__ Ld1b(z8.VnB(), p0.Zeroing(), SVEMemOperand(x0, x4));
__ Whilelo(p5.VnB(), x3, x1);
__ Sel(z9.VnB(), p5.Merging(), z1.VnB(), z0.VnB());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(z2, z3);
ASSERT_EQUAL_SVE(z4, z5);
ASSERT_EQUAL_SVE(z6, z7);
ASSERT_EQUAL_SVE(z8, z9);
}
delete[] data;
}
TEST_SVE(sve2_while_simple) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
START();
__ Mov(x0, 1);
__ Mov(x1, 0);
__ Mov(x2, 3);
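// The "while decrementing" forms set lanes for as long as the first
// operand, decremented once per element, remains above (Whilehi/Whilegt) or
// above or equal to (Whilehs/Whilege) the second, using unsigned (hi/hs) or
// signed (gt/ge) comparison. The comparison can wrap: Whilehs with x0 = 1
// and x1 = 0 below produces an all-true predicate.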
__ Whilehi(p0.VnB(), x0, x1);
__ Whilehs(p1.VnB(), x0, x1);
__ Whilehi(p2.VnB(), x2, x1);
__ Whilehs(p3.VnB(), x2, x1);
__ Whilehi(p4.VnB(), x2, x0);
__ Whilehs(p5.VnB(), x2, x0);
__ Whilegt(p6.VnB(), x0, x1);
__ Whilege(p7.VnB(), x0, x1);
__ Whilegt(p8.VnB(), x2, x1);
__ Whilege(p9.VnB(), x2, x1);
__ Whilegt(p10.VnB(), x2, x0);
__ Whilege(p11.VnB(), x2, x0);
__ Mov(x4, 0x80000000);
__ Mov(x5, 0x80000001);
__ Whilege(p12.VnB(), w5, w4);
__ Whilegt(p13.VnB(), w5, w4);
__ Mov(x6, 0x8000000000000000);
__ Mov(x7, 0x8000000000000001);
__ Whilege(p14.VnB(), x7, x6);
__ Whilegt(p15.VnB(), x7, x6);
for (int i = 0; i < 16; i++) {
__ Rev(PRegister(i).VnB(), PRegister(i).VnB());
}
END();
if (CAN_RUN()) {
RUN();
int p0_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
int p1_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
int p2_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
int p3_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
int p4_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1};
int p5_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
int p6_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
int p7_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1};
int p8_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
int p9_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1};
int p10_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1};
int p11_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
int p12_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
int p13_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
int p14_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
int p15_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
ASSERT_EQUAL_SVE(p0_exp, p0.VnB());
ASSERT_EQUAL_SVE(p1_exp, p1.VnB());
ASSERT_EQUAL_SVE(p2_exp, p2.VnB());
ASSERT_EQUAL_SVE(p3_exp, p3.VnB());
ASSERT_EQUAL_SVE(p4_exp, p4.VnB());
ASSERT_EQUAL_SVE(p5_exp, p5.VnB());
ASSERT_EQUAL_SVE(p6_exp, p6.VnB());
ASSERT_EQUAL_SVE(p7_exp, p7.VnB());
ASSERT_EQUAL_SVE(p8_exp, p8.VnB());
ASSERT_EQUAL_SVE(p9_exp, p9.VnB());
ASSERT_EQUAL_SVE(p10_exp, p10.VnB());
ASSERT_EQUAL_SVE(p11_exp, p11.VnB());
ASSERT_EQUAL_SVE(p12_exp, p12.VnB());
ASSERT_EQUAL_SVE(p13_exp, p13.VnB());
ASSERT_EQUAL_SVE(p14_exp, p14.VnB());
ASSERT_EQUAL_SVE(p15_exp, p15.VnB());
}
}
TEST_SVE(sve2_whilerw_whilewr_simple) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
START();
__ Mov(x0, 0);
__ Mov(x1, 1);
__ Mov(x2, 3);
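// Whilewr(pd, xn, xm) sets lanes [0, (xm - xn) / esize), the number of
// elements a loop may safely process per iteration without a
// write-after-read hazard, and sets all lanes when the addresses are equal
// or xm is behind xn. Whilerw is similar, but uses the absolute distance
// between the two addresses.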
__ Whilerw(p0.VnB(), x0, x0);
__ Whilerw(p1.VnB(), x0, x1);
__ Whilerw(p2.VnB(), x1, x0);
__ Whilewr(p3.VnB(), x0, x0);
__ Whilewr(p4.VnB(), x0, x1);
__ Whilewr(p5.VnB(), x1, x0);
__ Whilewr(p6.VnH(), x1, x1);
__ Whilewr(p7.VnH(), x1, x2);
__ Whilewr(p8.VnH(), x2, x1);
END();
if (CAN_RUN()) {
RUN();
int p0_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
ASSERT_EQUAL_SVE(p0_exp, p0.VnB());
int p1_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
ASSERT_EQUAL_SVE(p1_exp, p1.VnB());
int p2_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
ASSERT_EQUAL_SVE(p2_exp, p2.VnB());
int p3_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
ASSERT_EQUAL_SVE(p3_exp, p3.VnB());
int p4_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
ASSERT_EQUAL_SVE(p4_exp, p4.VnB());
int p5_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
ASSERT_EQUAL_SVE(p5_exp, p5.VnB());
int p6_exp[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
ASSERT_EQUAL_SVE(p6_exp, p6.VnB());
int p7_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
ASSERT_EQUAL_SVE(p7_exp, p7.VnB());
int p8_exp[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
ASSERT_EQUAL_SVE(p8_exp, p8.VnB());
}
}
TEST_SVE(sve2_sqrdcmlah) {
int32_t zn_inputs[] = {-1, -2, -3, -4, 1, 2, 3, 4};
int32_t zm_inputs[] = {-1, -2, 3, 4, 1, 2, -3, -4};
int32_t za_inputs[] = {1, 2, 3, 4, 5, 6, 7, 8};
int32_t zd_000_expected[] =
{1025, 2050, -6141, -8188, 1029, 2054, -6137, -8184};
int32_t zd_090_expected[] =
{1025, -510, -6141, 4612, 1029, -506, -6137, 4616};
int32_t zd_180_expected[] =
{-1023, -2046, 6147, 8196, -1019, -2042, 6151, 8200};
int32_t zd_270_expected[] =
{-1023, 514, 6147, -4604, -1019, 518, 6151, -4600};
int32_t zd_0_270_expected[] =
{2049, -1534, 6147, -4604, 2053, -1530, 6151, -4600};
int32_t zd_3_090_expected[] =
{1025, -510, 3075, -1532, 1029, -506, 3079, -1528};
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
START();
InsrHelper(&masm, z0.VnS(), zn_inputs);
InsrHelper(&masm, z1.VnS(), zm_inputs);
InsrHelper(&masm, z31.VnS(), za_inputs);
// The input values are small, so shift them left so that they noticeably
// affect the high (returned) half of the result in the destination.
int shift = 20;
__ Lsl(z0.VnS(), z0.VnS(), shift);
__ Lsl(z1.VnS(), z1.VnS(), shift);
__ Mov(z10, z31);
__ Sqrdcmlah(z10.VnS(), z10.VnS(), z0.VnS(), z1.VnS(), 0);
__ Mov(z11, z31);
__ Sqrdcmlah(z11.VnS(), z11.VnS(), z0.VnS(), z1.VnS(), 90);
__ Mov(z12, z31);
__ Sqrdcmlah(z12.VnS(), z12.VnS(), z0.VnS(), z1.VnS(), 180);
__ Mov(z13, z31);
__ Sqrdcmlah(z13.VnS(), z13.VnS(), z0.VnS(), z1.VnS(), 270);
__ Sqrdcmlah(z14.VnS(), z31.VnS(), z0.VnS(), z1.VnS(), 0);
__ Sqrdcmlah(z15.VnS(), z31.VnS(), z0.VnS(), z1.VnS(), 90);
__ Sqrdcmlah(z16.VnS(), z31.VnS(), z0.VnS(), z1.VnS(), 180);
__ Sqrdcmlah(z17.VnS(), z31.VnS(), z0.VnS(), z1.VnS(), 270);
__ Mov(z18, z31);
__ Sqrdcmlah(z18.VnS(), z18.VnS(), z0.VnS(), z1.VnS(), 0, 270);
__ Mov(z19, z31);
__ Sqrdcmlah(z19.VnS(), z19.VnS(), z0.VnS(), z1.VnS(), 1, 90);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(zd_000_expected, z10.VnS());
ASSERT_EQUAL_SVE(zd_090_expected, z11.VnS());
ASSERT_EQUAL_SVE(zd_180_expected, z12.VnS());
ASSERT_EQUAL_SVE(zd_270_expected, z13.VnS());
ASSERT_EQUAL_SVE(z14, z10);
ASSERT_EQUAL_SVE(z15, z11);
ASSERT_EQUAL_SVE(z16, z12);
ASSERT_EQUAL_SVE(z17, z13);
ASSERT_EQUAL_SVE(zd_0_270_expected, z18.VnS());
ASSERT_EQUAL_SVE(zd_3_090_expected, z19.VnS());
}
}
TEST_SVE(sve2_sqrdmlah) {
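// Sqrdmlah is a signed saturating rounding doubling multiply-accumulate
// returning the high half: roughly zd = sat(za + ((2 * zn * zm + rounding)
// >> esize)), so near-maximal same-sign products saturate to 0x7f..f.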
uint16_t zn_inputs_h[] = {0x7ffe, 0x7ffd, 0x7ffd, 0x7ffd, 0x8000,
0x7fff, 0x7ffe, 0x7ffe, 0x8001, 0x8000,
0x7ffd, 0x7ffd, 0x7ffd, 0x5555, 0x5555,
0x5555, 0x8000, 0x8000, 0xaaaa, 0x8001};
uint16_t zm_inputs_h[] = {0x7ffd, 0x7fff, 0x7ffe, 0x7ffd, 0x8001,
0x7fff, 0x7fff, 0x7ffe, 0x8000, 0x8000,
0xaaaa, 0x0001, 0x0001, 0xaaaa, 0xaaaa,
0xcccc, 0x8000, 0x8000, 0x8000, 0x8001};
uint16_t za_inputs_h[] = {0x1010, 0x1010, 0x1010, 0x1010, 0x1010,
0x1010, 0x1010, 0x1010, 0x8000, 0x8011,
0x8006, 0xff7d, 0xfeff, 0xaabc, 0xaabb,
0x9c72, 0x8000, 0x0000, 0x8000, 0xffff};
uint16_t zd_expected_h[] = {0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff,
0x7fff, 0x7fff, 0x7fff, 0xffff, 0x0011,
0x8000, 0xff7e, 0xff00, 0x8000, 0x8000,
0x8000, 0x0000, 0x7fff, 0xd556, 0x7ffd};
uint32_t zn_inputs_s[] = {0x04000000,
0x80000000,
0x04000000,
0x80000000,
0x80000000,
0x80000001,
0x7fffffff,
0x80000000,
0x7ffffffe,
0x7ffffffd,
0x7ffffffd,
0x7ffffffd};
uint32_t zm_inputs_s[] = {0x00000020,
0x80000000,
0x00000010,
0x80000000,
0x7fffffff,
0x80000000,
0x80000000,
0x80000001,
0x7ffffffd,
0x7fffffff,
0x7ffffffe,
0x7ffffffd};
uint32_t za_inputs_s[] = {0x00000000,
0x00000000,
0x00000020,
0x00108000,
0x00000000,
0x00000001,
0x00000000,
0x00000001,
0x10101010,
0x10101010,
0x10101010,
0x10101010};
uint32_t zd_expected_s[] = {0x00000001,
0x7fffffff,
0x00000021,
0x7fffffff,
0x80000001,
0x7fffffff,
0x80000001,
0x7fffffff,
0x7fffffff,
0x7fffffff,
0x7fffffff,
0x7fffffff};
uint64_t zn_inputs_d[] = {0x0400000000000000, 0x8000000000000000,
0x0400000000000000, 0x8000000000000000,
0x8000000000000000, 0x8000000000000001,
0x7fffffffffffffff, 0x8000000000000000,
0x7ffffffffffffffe, 0x7ffffffffffffffd,
0x7ffffffffffffffd, 0x7ffffffffffffffd,
0xf1299accc9186169, 0xd529d2675ee9da21,
0x1a10b5d60b92dcf9, 0xfb1d358e0e6455b1,
0x8eb7721078bdc589, 0x4171509750ded141,
0x8eb7721078bdc589, 0x4171509750ded141};
uint64_t zm_inputs_d[] = {0x0000000000000020, 0x8000000000000000,
0x0000000000000010, 0x8000000000000000,
0x7fffffffffffffff, 0x8000000000000000,
0x8000000000000000, 0x8000000000000001,
0x7ffffffffffffffd, 0x7fffffffffffffff,
0x7ffffffffffffffe, 0x7ffffffffffffffd,
0x30b940efe73f180e, 0x3bc1ff1e52a99b66,
0x40de5c9793535a5e, 0x24752faf47bdddb6,
0x162663016b07e5ae, 0x1de34b56f3d22006,
0x8eb7721078bdc589, 0x4171509750ded141};
uint64_t za_inputs_d[] = {0x0000000000000000, 0x0000000000000000,
0x0000000000000020, 0x0010108000000000,
0x0000000000000000, 0x0000000000000001,
0x0000000000000000, 0x0000000000000001,
0x1010101010101010, 0x1010101010101010,
0x1010101010101010, 0x1010101010101010,
0xb18253371b2c2c77, 0xa70de31e6645eaef,
0xda817198c0318487, 0x9fd9e6b8e04b42ff,
0xced1f6b7119ab197, 0x01ae051a85509b0f,
0x01a211e9352f7927, 0x7667b70a5b13749f};
uint64_t zd_expected_d[] = {0x0000000000000001, 0x7fffffffffffffff,
0x0000000000000021, 0x7fffffffffffffff,
0x8000000000000001, 0x7fffffffffffffff,
0x8000000000000001, 0x7fffffffffffffff,
0x7fffffffffffffff, 0x7fffffffffffffff,
0x7fffffffffffffff, 0x7fffffffffffffff,
0xabdc73dea0d72a35, 0x930e3dc877301966,
0xe7b7145a059f8a9f, 0x9e75a4a9d10cf8af,
0xbb378528642d2581, 0x10f5e6d693ffddf3,
0x65e455a46adc091c, 0x7fffffffffffffff};
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
START();
InsrHelper(&masm, z0.VnH(), zn_inputs_h);
InsrHelper(&masm, z1.VnH(), zm_inputs_h);
InsrHelper(&masm, z2.VnH(), za_inputs_h);
__ Sqrdmlah(z2.VnH(), z2.VnH(), z0.VnH(), z1.VnH());
InsrHelper(&masm, z3.VnS(), zn_inputs_s);
InsrHelper(&masm, z4.VnS(), zm_inputs_s);
InsrHelper(&masm, z5.VnS(), za_inputs_s);
__ Sqrdmlah(z5.VnS(), z5.VnS(), z3.VnS(), z4.VnS());
InsrHelper(&masm, z6.VnD(), zn_inputs_d);
InsrHelper(&masm, z7.VnD(), zm_inputs_d);
InsrHelper(&masm, z8.VnD(), za_inputs_d);
__ Sqrdmlah(z8.VnD(), z8.VnD(), z6.VnD(), z7.VnD());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(zd_expected_h, z2.VnH());
ASSERT_EQUAL_SVE(zd_expected_s, z5.VnS());
ASSERT_EQUAL_SVE(zd_expected_d, z8.VnD());
}
}
TEST_SVE(sve2_cmla) {
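// Cmla treats each pair of elements as a complex number (even lanes real,
// odd lanes imaginary) and accumulates a partial product of the sources
// selected by the rotation (0, 90, 180 or 270 degrees).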
int32_t zn_inputs_s[] = {-2, -4, -6, -8, 2, 4, 6, 8};
int32_t zm_inputs_s[] = {-2, -4, -6, -8, 2, 4, 6, 8};
int32_t zda_inputs_s[] = {1, 2, 3, 4, 5, 6, 7, 8};
int32_t zd_000_expected[] = {9, 18, 51, 68, 13, 22, 55, 72};
int32_t zd_090_expected[] = {9, -2, 51, -32, 13, 2, 55, -28};
int32_t zd_180_expected[] = {-7, -14, -45, -60, -3, -10, -41, -56};
int32_t zd_270_expected[] = {-7, 6, -45, 40, -3, 10, -41, 44};
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
START();
InsrHelper(&masm, z31.VnS(), zn_inputs_s);
InsrHelper(&masm, z30.VnS(), zm_inputs_s);
InsrHelper(&masm, z0.VnS(), zda_inputs_s);
__ Mov(z29, z0);
__ Cmla(z0.VnS(), z0.VnS(), z31.VnS(), z30.VnS(), 0);
InsrHelper(&masm, z1.VnS(), zda_inputs_s);
__ Mov(z28, z1);
__ Cmla(z1.VnS(), z1.VnS(), z31.VnS(), z30.VnS(), 90);
InsrHelper(&masm, z2.VnS(), zda_inputs_s);
__ Mov(z27, z2);
__ Cmla(z2.VnS(), z2.VnS(), z31.VnS(), z30.VnS(), 180);
InsrHelper(&masm, z3.VnS(), zda_inputs_s);
__ Mov(z26, z3);
__ Cmla(z3.VnS(), z3.VnS(), z31.VnS(), z30.VnS(), 270);
__ Cmla(z4.VnS(), z29.VnS(), z31.VnS(), z30.VnS(), 0);
__ Cmla(z5.VnS(), z28.VnS(), z31.VnS(), z30.VnS(), 90);
__ Cmla(z6.VnS(), z27.VnS(), z31.VnS(), z30.VnS(), 180);
__ Cmla(z7.VnS(), z26.VnS(), z31.VnS(), z30.VnS(), 270);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(zd_000_expected, z0.VnS());
ASSERT_EQUAL_SVE(zd_090_expected, z1.VnS());
ASSERT_EQUAL_SVE(zd_180_expected, z2.VnS());
ASSERT_EQUAL_SVE(zd_270_expected, z3.VnS());
ASSERT_EQUAL_SVE(z4, z0);
ASSERT_EQUAL_SVE(z5, z1);
ASSERT_EQUAL_SVE(z6, z2);
ASSERT_EQUAL_SVE(z7, z3);
}
}
TEST_SVE(sve2_integer_saturating_multiply_add_long) {
int32_t zn_bottom_inputs[] =
{-2, -4, -6, -8, INT32_MAX, INT32_MIN, INT32_MIN};
int32_t zm_top_inputs[] = {1, 3, 5, 7, INT32_MAX, INT32_MAX, INT32_MIN};
int64_t sqdmlalbt_expected[] = {2,
-19,
-56,
-109,
static_cast<int64_t>(0x7ffffffe00000004),
RawbitsToInt64(0x8000000100000001),
INT64_MAX};
int64_t sqdmlslbt_expected[] = {-2,
19,
56,
109,
RawbitsToInt64(0x80000001fffffffc),
static_cast<int64_t>(0x7ffffffeffffffff),
RawbitsToInt64(0x8000000000000001)};
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
START();
InsrHelper(&masm, z31.VnS(), zn_bottom_inputs);
InsrHelper(&masm, z30.VnS(), zm_top_inputs);
__ Dup(z29.VnD(), 0);
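// Interleave the inputs with zero so that the zn values land in the even
// (bottom) elements and the zm values in the odd (top) elements, as the
// bottom-times-top Sqdmlalbt/Sqdmlslbt forms require.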
__ Zip1(z31.VnS(), z31.VnS(), z29.VnS());
__ Zip1(z30.VnS(), z29.VnS(), z30.VnS());
// Initialise inputs for za.
__ Index(z1.VnD(), 0, 1);
__ Index(z2.VnD(), 0, -1);
__ Sqdmlalbt(z1.VnD(), z1.VnD(), z31.VnS(), z30.VnS());
__ Sqdmlslbt(z2.VnD(), z2.VnD(), z31.VnS(), z30.VnS());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(sqdmlalbt_expected, z1.VnD());
ASSERT_EQUAL_SVE(sqdmlslbt_expected, z2.VnD());
}
}
TEST_SVE(sve2_floating_point_multiply_add_long_vector) {
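// Fmlalb/Fmlalt multiply the even (bottom) / odd (top) half-precision
// elements of the sources and accumulate the widened products into the
// overlapping single-precision lanes; Fmlslb/Fmlslt subtract the products.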
uint16_t zn_inputs[] = {Float16ToRawbits(Float16(1000)),
Float16ToRawbits(Float16(2000)),
Float16ToRawbits(Float16(0.5)),
Float16ToRawbits(Float16(-0.5)),
Float16ToRawbits(Float16(14)),
Float16ToRawbits(Float16(-14)),
Float16ToRawbits(kFP16PositiveInfinity),
Float16ToRawbits(kFP16NegativeInfinity)};
uint16_t zm_inputs[] = {Float16ToRawbits(Float16(10)),
Float16ToRawbits(Float16(-10)),
Float16ToRawbits(Float16(10)),
Float16ToRawbits(Float16(-10)),
Float16ToRawbits(Float16(10)),
Float16ToRawbits(Float16(-10)),
Float16ToRawbits(Float16(10)),
Float16ToRawbits(Float16(-10))};
uint32_t za_inputs[] = {FloatToRawbits(1.0f),
FloatToRawbits(-1.0f),
FloatToRawbits(1.0f),
FloatToRawbits(-1.0f)};
uint32_t fmlalb_zd_expected[] = {0xc69c3e00, // -19999
0x40800000, // 4
0x430d0000, // 141
FloatToRawbits(kFP32PositiveInfinity)};
uint32_t fmlalt_zd_expected[] = {0x461c4400, // 10001
0x40800000, // 4
0x430d0000, // 141
FloatToRawbits(kFP32PositiveInfinity)};
uint32_t fmlslb_zd_expected[] = {0x469c4200, // 20001
0xc0c00000, // -6
0xc30b0000, // -139
FloatToRawbits(kFP32NegativeInfinity)};
uint32_t fmlslt_zd_expected[] = {0xc61c3c00, // -9999
0xc0c00000, // -6
0xc30b0000, // -139
FloatToRawbits(kFP32NegativeInfinity)};
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
START();
InsrHelper(&masm, z31.VnH(), zn_inputs);
InsrHelper(&masm, z30.VnH(), zm_inputs);
InsrHelper(&masm, z29.VnS(), za_inputs);
__ Mov(z0, z29);
__ Fmlalb(z0.VnS(), z0.VnS(), z31.VnH(), z30.VnH());
__ Mov(z1, z29);
__ Fmlalt(z1.VnS(), z1.VnS(), z31.VnH(), z30.VnH());
__ Mov(z2, z29);
__ Fmlslb(z2.VnS(), z2.VnS(), z31.VnH(), z30.VnH());
__ Mov(z3, z29);
__ Fmlslt(z3.VnS(), z3.VnS(), z31.VnH(), z30.VnH());
__ Fmlalb(z4.VnS(), z29.VnS(), z31.VnH(), z30.VnH());
__ Fmlalt(z5.VnS(), z29.VnS(), z31.VnH(), z30.VnH());
__ Fmlslb(z6.VnS(), z29.VnS(), z31.VnH(), z30.VnH());
__ Fmlslt(z7.VnS(), z29.VnS(), z31.VnH(), z30.VnH());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(fmlalb_zd_expected, z0.VnS());
ASSERT_EQUAL_SVE(fmlalt_zd_expected, z1.VnS());
ASSERT_EQUAL_SVE(fmlslb_zd_expected, z2.VnS());
ASSERT_EQUAL_SVE(fmlslt_zd_expected, z3.VnS());
ASSERT_EQUAL_SVE(z4, z0);
ASSERT_EQUAL_SVE(z5, z1);
ASSERT_EQUAL_SVE(z6, z2);
ASSERT_EQUAL_SVE(z7, z3);
}
}
TEST_SVE(sve2_flogb_simple) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
START();
__ Ptrue(p0.VnB());
__ Index(z0.VnS(), -4, 1);
__ Mov(z1.VnS(), 0);
__ Mov(z2.VnD(), 0x000fffffffffffff);
__ Mov(z3.VnD(), 0x0010000000000000);
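// Flogb extracts the signed base-2 exponent of each element as an integer:
// zero gives INT_MIN, infinity gives INT_MAX and NaN gives INT_MIN for the
// lane size. The two D-lane inputs are the largest subnormal double
// (exponent -1023, i.e. 0x...fc01) and the smallest normal double
// (exponent -1022).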
__ Scvtf(z0.VnS(), p0.Merging(), z0.VnS());
__ Scvtf(z1.VnS(), p0.Merging(), z1.VnS());
__ Fdiv(z1.VnS(), p0.Merging(), z0.VnS(), z1.VnS());
__ Flogb(z0.VnS(), p0.Merging(), z0.VnS());
__ Flogb(z1.VnS(), p0.Merging(), z1.VnS());
__ Flogb(z2.VnD(), p0.Merging(), z2.VnD());
__ Flogb(z3.VnD(), p0.Merging(), z3.VnD());
END();
if (CAN_RUN()) {
RUN();
uint64_t expected_z0[] = {0x0000000200000002,
0x0000000200000002,
0x0000000100000001,
0x0000000080000000,
0x0000000000000001,
0x0000000100000002};
ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
uint64_t expected_z1[] = {0x7fffffff7fffffff,
0x7fffffff7fffffff,
0x7fffffff7fffffff,
0x7fffffff80000000,
0x7fffffff7fffffff,
0x7fffffff7fffffff};
ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
uint64_t expected_z2[] = {0xfffffffffffffc01,
0xfffffffffffffc01,
0xfffffffffffffc01,
0xfffffffffffffc01};
ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
uint64_t expected_z3[] = {0xfffffffffffffc02,
0xfffffffffffffc02,
0xfffffffffffffc02,
0xfffffffffffffc02};
ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
}
}
TEST_SVE(neon_matmul) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
CPUFeatures::kSVEI8MM,
CPUFeatures::kNEON,
CPUFeatures::kI8MM);
// Test Neon integer matrix multiply against SVE.
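// Each matmul instruction multiplies a 2x8 matrix of bytes from the first
// source by an 8x2 matrix from the second and accumulates into a 2x2 matrix
// of S lanes, independently in each 128-bit segment; the S/U/US prefixes
// select the signedness of the operands.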
START();
__ Movi(v0.V2D(), 0xffeeddccbbaa9988, 0x77665544332211);
__ Movi(v1.V2D(), 0xaa5555aa55555555, 0x55aaaa55aaaaaa);
__ Movi(v2.V2D(), 0, 0);
__ Movi(v3.V2D(), 0, 0);
__ Movi(v4.V2D(), 0, 0);
__ Movi(v5.V2D(), 0, 0);
__ Movi(v6.V2D(), 0, 0);
__ Movi(v7.V2D(), 0, 0);
__ Smmla(v2.V4S(), v0.V16B(), v1.V16B());
__ Smmla(z3.VnS(), z3.VnS(), z0.VnB(), z1.VnB());
__ Ummla(v4.V4S(), v0.V16B(), v1.V16B());
__ Ummla(z5.VnS(), z5.VnS(), z0.VnB(), z1.VnB());
__ Usmmla(v6.V4S(), v0.V16B(), v1.V16B());
__ Usmmla(z7.VnS(), z7.VnS(), z0.VnB(), z1.VnB());
END();
if (CAN_RUN()) {
RUN();
// The inputs as Z registers are zero beyond the least-significant 128 bits,
// so the Neon and SVE results should be equal for any VL.
ASSERT_EQUAL_SVE(z3, z2);
ASSERT_EQUAL_SVE(z5, z4);
ASSERT_EQUAL_SVE(z7, z6);
}
}
TEST_SVE(sudot_usdot) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
CPUFeatures::kSVE2,
CPUFeatures::kSVEI8MM);
START();
__ Ptrue(p0.VnB());
__ Index(z0.VnS(), -424242, 77777);
__ Index(z1.VnB(), 127, -1);
__ Sqabs(z1.VnB(), p0.Merging(), z1.VnB());
__ Index(z2.VnB(), 0, 1);
__ Sqabs(z2.VnB(), p0.Merging(), z2.VnB());
__ Index(z3.VnB(), -128, 1);
__ Mov(z4.VnD(), 0);
// Test Usdot against Udot/Sdot over the range of inputs where they should be
// equal.
__ Usdot(z5.VnS(), z0.VnS(), z1.VnB(), z2.VnB());
__ Udot(z6.VnS(), z0.VnS(), z1.VnB(), z2.VnB());
__ Usdot(z7.VnS(), z0.VnS(), z1.VnB(), z3.VnB());
__ Sdot(z8.VnS(), z0.VnS(), z1.VnB(), z3.VnB());
// Construct values which, when interpreted correctly as signed/unsigned,
// should give a zero result for dot product.
__ Mov(z10.VnS(), 0x8101ff40); // [-127, 1, -1, 64] as signed bytes.
__ Mov(z11.VnS(), 0x02fe8002); // [2, 254, 128, 2] as unsigned bytes.
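// Taking the bytes of each lane in increasing significance, the zero result
// follows from 2 * 64 + 128 * (-1) + 254 * 1 + 2 * (-127) = 0.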
__ Usdot(z12.VnS(), z4.VnS(), z11.VnB(), z10.VnB());
__ Usdot(z13.VnS(), z4.VnS(), z10.VnB(), z11.VnB());
// Construct a vector with duplicated values across segments. This allows
// testing indexed dot product against the already tested variant.
__ Mov(z14.VnS(), 1);
__ Mul(z15.VnS(), z14.VnS(), z3.VnS(), 1);
__ Usdot(z16.VnS(), z0.VnS(), z3.VnB(), z3.VnB(), 1);
__ Usdot(z17.VnS(), z0.VnS(), z3.VnB(), z15.VnB());
__ Sudot(z18.VnS(), z0.VnS(), z3.VnB(), z3.VnB(), 1);
__ Usdot(z19.VnS(), z0.VnS(), z15.VnB(), z3.VnB());
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(z6, z5);
ASSERT_EQUAL_SVE(z8, z7);
ASSERT_EQUAL_SVE(z4, z12);
uint64_t z13_expected[] = {0xffff8200ffff8200, 0xffff8200ffff8200};
ASSERT_EQUAL_SVE(z13_expected, z13.VnD());
ASSERT_EQUAL_SVE(z17, z16);
ASSERT_EQUAL_SVE(z19, z18);
}
}
TEST_SVE(neon_ins_zero_high_regression_test) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kNEON, CPUFeatures::kSVE);
START();
__ Movi(v0.V2D(), 0x0f0e0d0c0b0a0908, 0x0706050403020100);
// Check that both forms of Ins zero bits <VL-1:128>.
__ Index(z1.VnB(), 0, 1);
__ Ins(v1.V16B(), 0, wzr);
__ Index(z2.VnB(), 0, 1);
__ Ins(v2.V16B(), 3, v2.V16B(), 3);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(z0, z1);
ASSERT_EQUAL_SVE(z0, z2);
}
}
TEST_SVE(neon_fcvt_zero_high_regression_test) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kFP,
CPUFeatures::kNEON,
CPUFeatures::kSVE);
START();
__ Mov(z1.VnD(), 0);
__ Mov(z2.VnD(), 0);
__ Mov(z3.VnD(), 0);
__ Mov(z4.VnD(), 0);
__ Mov(z5.VnD(), 0);
__ Mov(z6.VnD(), 0);
__ Mov(z10.VnD(), 0);
Label done;
// Skip calculations for VL128.
__ Rdvl(x0, 1);
__ Cmp(x0, 16);
__ B(eq, &done);
__ Movi(v0.V2D(), 0x3ff000003f800000);
__ Index(z1.VnB(), 0, 1);
__ Index(z2.VnB(), 0, 1);
__ Index(z3.VnB(), 0, 1);
__ Index(z4.VnB(), 0, 1);
__ Index(z5.VnB(), 0, 1);
__ Index(z6.VnB(), 0, 1);
// Test zeroing bits <VL-1:128> for fcvtl, fcvtn and fcvtxn.
__ Fcvtl(v1.V2D(), v0.V2S());
__ Fcvtl2(v2.V2D(), v0.V4S());
__ Fcvtn(v3.V2S(), v0.V2D());
__ Fcvtn2(v4.V4S(), v0.V2D());
__ Fcvtxn(v5.V2S(), v0.V2D());
__ Fcvtxn2(v6.V4S(), v0.V2D());
// Set the expected non-zero bits to zero.
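// Ext shifts each result right by the number of bytes the instruction
// legitimately writes, filling from z10 (zero), so only the bits
// <VL-1:128> under test remain.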
__ Ext(z1.VnB(), z1.VnB(), z10.VnB(), kDRegSizeInBytes * 2);
__ Ext(z2.VnB(), z2.VnB(), z10.VnB(), kDRegSizeInBytes * 2);
__ Ext(z3.VnB(), z3.VnB(), z10.VnB(), kSRegSizeInBytes * 2);
__ Ext(z4.VnB(), z4.VnB(), z10.VnB(), kSRegSizeInBytes * 4);
__ Ext(z5.VnB(), z5.VnB(), z10.VnB(), kSRegSizeInBytes * 2);
__ Ext(z6.VnB(), z6.VnB(), z10.VnB(), kSRegSizeInBytes * 4);
__ Bind(&done);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(z10, z1);
ASSERT_EQUAL_SVE(z10, z2);
ASSERT_EQUAL_SVE(z10, z3);
ASSERT_EQUAL_SVE(z10, z4);
ASSERT_EQUAL_SVE(z10, z5);
ASSERT_EQUAL_SVE(z10, z6);
}
}
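// Fill z0 with a known pattern, execute the NEON instruction (which must
// zero bits <VL-1:128> of its destination), then OR the result into z10 so
// that any stale high bits accumulate and can be checked once at the end.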
#define TEST_ZEROING(INST) \
__ Index(z0.VnB(), 0, 1); \
__ INST; \
__ Orr(z10.VnB(), z10.VnB(), z0.VnB());
TEST_SVE(neon_zero_high) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kFP,
CPUFeatures::kNEON,
CPUFeatures::kNEONHalf,
CPUFeatures::kSVE,
CPUFeatures::kFcma,
CPUFeatures::kFHM,
CPUFeatures::kFrintToFixedSizedInt,
CPUFeatures::kDotProduct,
CPUFeatures::kRDM,
CPUFeatures::kI8MM);
START();
__ Mov(z10.VnD(), 0); // Initialise cumulative result register.
TEST_ZEROING(Abs(v0.V16B(), v0.V16B()));
TEST_ZEROING(Abs(v0.V2S(), v0.V2S()));
TEST_ZEROING(Add(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Add(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Addhn2(v0.V16B(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Addhn(v0.V4H(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Addp(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Addp(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(And(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Bic(v0.V8H(), 0, 0));
TEST_ZEROING(Bic(v0.V2S(), 255, 0));
TEST_ZEROING(Bic(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Bif(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Bit(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Bsl(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Cls(v0.V16B(), v0.V16B()));
TEST_ZEROING(Cls(v0.V2S(), v0.V2S()));
TEST_ZEROING(Clz(v0.V16B(), v0.V16B()));
TEST_ZEROING(Clz(v0.V2S(), v0.V2S()));
TEST_ZEROING(Cmeq(v0.V16B(), v0.V16B(), 0));
TEST_ZEROING(Cmeq(v0.V2S(), v0.V2S(), 0));
TEST_ZEROING(Cmeq(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Cmeq(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Cmge(v0.V16B(), v0.V16B(), 0));
TEST_ZEROING(Cmge(v0.V2S(), v0.V2S(), 0));
TEST_ZEROING(Cmge(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Cmge(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Cmgt(v0.V16B(), v0.V16B(), 0));
TEST_ZEROING(Cmgt(v0.V2S(), v0.V2S(), 0));
TEST_ZEROING(Cmgt(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Cmgt(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Cmhi(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Cmhi(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Cmhs(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Cmhs(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Cmle(v0.V16B(), v0.V16B(), 0));
TEST_ZEROING(Cmle(v0.V2S(), v0.V2S(), 0));
TEST_ZEROING(Cmlt(v0.V16B(), v0.V16B(), 0));
TEST_ZEROING(Cmlt(v0.V2S(), v0.V2S(), 0));
TEST_ZEROING(Cmtst(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Cmtst(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Cnt(v0.V16B(), v0.V16B()));
TEST_ZEROING(Dup(v0.V2S(), w0));
TEST_ZEROING(Dup(v0.V8B(), w0));
TEST_ZEROING(Dup(v0.V2S(), v0.S(), 0));
TEST_ZEROING(Dup(v0.V8B(), v0.B(), 0));
TEST_ZEROING(Eor(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Ext(v0.V16B(), v0.V16B(), v0.V16B(), 0));
TEST_ZEROING(Ext(v0.V8B(), v0.V8B(), v0.V8B(), 4));
TEST_ZEROING(Fabd(v0.V4S(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Fabd(v0.V8H(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Fabs(v0.V4S(), v0.V4S()));
TEST_ZEROING(Fabs(v0.V8H(), v0.V8H()));
TEST_ZEROING(Facge(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Facge(v0.V8H(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Facgt(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Facgt(v0.V8H(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Fadd(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Fadd(v0.V8H(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Faddp(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Faddp(v0.V8H(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Fcadd(v0.V2S(), v0.V2S(), v0.V2S(), 90));
TEST_ZEROING(Fcadd(v0.V8H(), v0.V8H(), v0.V8H(), 90));
TEST_ZEROING(Fcmeq(v0.V2S(), v0.V2S(), 0));
TEST_ZEROING(Fcmeq(v0.V8H(), v0.V8H(), 0));
TEST_ZEROING(Fcmeq(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Fcmeq(v0.V8H(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Fcmge(v0.V2S(), v0.V2S(), 0));
TEST_ZEROING(Fcmge(v0.V8H(), v0.V8H(), 0));
TEST_ZEROING(Fcmge(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Fcmge(v0.V8H(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Fcmgt(v0.V2S(), v0.V2S(), 0));
TEST_ZEROING(Fcmgt(v0.V8H(), v0.V8H(), 0));
TEST_ZEROING(Fcmgt(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Fcmgt(v0.V8H(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Fcmla(v0.V4H(), v0.V4H(), v0.H(), 0, 0));
TEST_ZEROING(Fcmla(v0.V4S(), v0.V4S(), v0.S(), 0, 0));
TEST_ZEROING(Fcmla(v0.V4S(), v0.V4S(), v0.V4S(), 0));
TEST_ZEROING(Fcmla(v0.V4H(), v0.V4H(), v0.V4H(), 0));
TEST_ZEROING(Fcmle(v0.V2S(), v0.V2S(), 0));
TEST_ZEROING(Fcmle(v0.V8H(), v0.V8H(), 0));
TEST_ZEROING(Fcmlt(v0.V2S(), v0.V2S(), 0));
TEST_ZEROING(Fcmlt(v0.V8H(), v0.V8H(), 0));
TEST_ZEROING(Fcvtas(v0.V2S(), v0.V2S()));
TEST_ZEROING(Fcvtas(v0.V8H(), v0.V8H()));
TEST_ZEROING(Fcvtau(v0.V2S(), v0.V2S()));
TEST_ZEROING(Fcvtau(v0.V8H(), v0.V8H()));
TEST_ZEROING(Fcvtl2(v0.V4S(), v0.V8H()));
TEST_ZEROING(Fcvtl(v0.V2D(), v0.V2S()));
TEST_ZEROING(Fcvtms(v0.V2S(), v0.V2S()));
TEST_ZEROING(Fcvtms(v0.V8H(), v0.V8H()));
TEST_ZEROING(Fcvtmu(v0.V2S(), v0.V2S()));
TEST_ZEROING(Fcvtmu(v0.V8H(), v0.V8H()));
TEST_ZEROING(Fcvtn2(v0.V8H(), v0.V4S()));
TEST_ZEROING(Fcvtn(v0.V2S(), v0.V2D()));
TEST_ZEROING(Fcvtns(v0.V2S(), v0.V2S()));
TEST_ZEROING(Fcvtns(v0.V8H(), v0.V8H()));
TEST_ZEROING(Fcvtnu(v0.V2S(), v0.V2S()));
TEST_ZEROING(Fcvtnu(v0.V8H(), v0.V8H()));
TEST_ZEROING(Fcvtps(v0.V2S(), v0.V2S()));
TEST_ZEROING(Fcvtps(v0.V8H(), v0.V8H()));
TEST_ZEROING(Fcvtpu(v0.V2S(), v0.V2S()));
TEST_ZEROING(Fcvtpu(v0.V8H(), v0.V8H()));
TEST_ZEROING(Fcvtxn(v0.V2S(), v0.V2D()));
TEST_ZEROING(Fcvtxn2(v0.V4S(), v0.V2D()));
TEST_ZEROING(Fcvtzs(v0.V2S(), v0.V2S()));
TEST_ZEROING(Fcvtzs(v0.V8H(), v0.V8H()));
TEST_ZEROING(Fcvtzs(v0.V2D(), v0.V2D(), 8));
TEST_ZEROING(Fcvtzu(v0.V2S(), v0.V2S()));
TEST_ZEROING(Fcvtzu(v0.V4H(), v0.V4H()));
TEST_ZEROING(Fcvtzu(v0.V2D(), v0.V2D(), 8));
TEST_ZEROING(Fdiv(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Fdiv(v0.V8H(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Fmax(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Fmax(v0.V8H(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Fmaxnm(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Fmaxnm(v0.V8H(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Fmaxnmp(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Fmaxnmp(v0.V8H(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Fmaxp(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Fmaxp(v0.V8H(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Fmin(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Fmin(v0.V8H(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Fminnm(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Fminnm(v0.V8H(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Fminnmp(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Fminnmp(v0.V8H(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Fminp(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Fminp(v0.V8H(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Fmla(v0.V4S(), v0.V4S(), v0.S(), 0));
TEST_ZEROING(Fmla(v0.V4H(), v0.V4H(), v0.H(), 2));
TEST_ZEROING(Fmla(v0.V4S(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Fmla(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Fmlal2(v0.V4S(), v0.V4H(), v0.H(), 0));
TEST_ZEROING(Fmlal2(v0.V2S(), v0.V2H(), v0.H(), 2));
TEST_ZEROING(Fmlal2(v0.V4S(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Fmlal(v0.V4S(), v0.V4H(), v0.H(), 0));
TEST_ZEROING(Fmlal(v0.V2S(), v0.V2H(), v0.H(), 2));
TEST_ZEROING(Fmlal(v0.V4S(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Fmls(v0.V4S(), v0.V4S(), v0.S(), 0));
TEST_ZEROING(Fmls(v0.V4H(), v0.V4H(), v0.H(), 2));
TEST_ZEROING(Fmls(v0.V4S(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Fmls(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Fmlsl2(v0.V4S(), v0.V4H(), v0.H(), 0));
TEST_ZEROING(Fmlsl2(v0.V2S(), v0.V2H(), v0.H(), 2));
TEST_ZEROING(Fmlsl2(v0.V4S(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Fmlsl(v0.V4S(), v0.V4H(), v0.H(), 0));
TEST_ZEROING(Fmlsl(v0.V2S(), v0.V2H(), v0.H(), 2));
TEST_ZEROING(Fmlsl(v0.V4S(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Fmov(v0.V2D(), 2.0000));
TEST_ZEROING(Fmov(v0.V4H(), 2.0000));
TEST_ZEROING(Fmov(v0.D(), 1, x1));
TEST_ZEROING(Fmul(v0.V4S(), v0.V4S(), v0.S(), 0));
TEST_ZEROING(Fmul(v0.V4H(), v0.V4H(), v0.H(), 2));
TEST_ZEROING(Fmul(v0.V4S(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Fmul(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Fmulx(v0.V4S(), v0.V4S(), v0.S(), 0));
TEST_ZEROING(Fmulx(v0.V4H(), v0.V4H(), v0.H(), 2));
TEST_ZEROING(Fmulx(v0.V4S(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Fmulx(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Fneg(v0.V4S(), v0.V4S()));
TEST_ZEROING(Fneg(v0.V4H(), v0.V4H()));
TEST_ZEROING(Frecpe(v0.V4S(), v0.V4S()));
TEST_ZEROING(Frecpe(v0.V4H(), v0.V4H()));
TEST_ZEROING(Frecps(v0.V4S(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Frecps(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Frint32x(v0.V4S(), v0.V4S()));
TEST_ZEROING(Frint32z(v0.V4S(), v0.V4S()));
TEST_ZEROING(Frint64x(v0.V4S(), v0.V4S()));
TEST_ZEROING(Frint64z(v0.V4S(), v0.V4S()));
TEST_ZEROING(Frinta(v0.V4S(), v0.V4S()));
TEST_ZEROING(Frinta(v0.V4H(), v0.V4H()));
TEST_ZEROING(Frinti(v0.V4S(), v0.V4S()));
TEST_ZEROING(Frinti(v0.V4H(), v0.V4H()));
TEST_ZEROING(Frintm(v0.V4S(), v0.V4S()));
TEST_ZEROING(Frintm(v0.V4H(), v0.V4H()));
TEST_ZEROING(Frintn(v0.V4S(), v0.V4S()));
TEST_ZEROING(Frintn(v0.V4H(), v0.V4H()));
TEST_ZEROING(Frintp(v0.V4S(), v0.V4S()));
TEST_ZEROING(Frintp(v0.V4H(), v0.V4H()));
TEST_ZEROING(Frintx(v0.V4S(), v0.V4S()));
TEST_ZEROING(Frintx(v0.V4H(), v0.V4H()));
TEST_ZEROING(Frintz(v0.V4S(), v0.V4S()));
TEST_ZEROING(Frintz(v0.V4H(), v0.V4H()));
TEST_ZEROING(Frsqrte(v0.V4S(), v0.V4S()));
TEST_ZEROING(Frsqrte(v0.V4H(), v0.V4H()));
TEST_ZEROING(Frsqrts(v0.V4S(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Frsqrts(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Fsqrt(v0.V4S(), v0.V4S()));
TEST_ZEROING(Fsqrt(v0.V4H(), v0.V4H()));
TEST_ZEROING(Fsub(v0.V4S(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Fsub(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Mov(v0.D(), 0, x0));
TEST_ZEROING(Mov(v0.S(), 0, w0));
TEST_ZEROING(Mov(v0.H(), 0, w0));
TEST_ZEROING(Mov(v0.B(), 0, w0));
TEST_ZEROING(Mov(v0.D(), 0, v0.D(), 0));
TEST_ZEROING(Mov(v0.S(), 0, v0.S(), 0));
TEST_ZEROING(Mov(v0.H(), 0, v0.H(), 0));
TEST_ZEROING(Mov(v0.B(), 0, v0.B(), 0));
TEST_ZEROING(Mla(v0.V4S(), v0.V4S(), v0.S(), 0));
TEST_ZEROING(Mla(v0.V4H(), v0.V4H(), v0.H(), 0));
TEST_ZEROING(Mla(v0.V4S(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Mla(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Mls(v0.V4S(), v0.V4S(), v0.S(), 0));
TEST_ZEROING(Mls(v0.V4H(), v0.V4H(), v0.H(), 0));
TEST_ZEROING(Mls(v0.V4S(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Mls(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Movi(v0.V2D(), 0xff));
TEST_ZEROING(Movi(v0.V2S(), 0xff));
TEST_ZEROING(Movi(v0.V4S(), 0x10, LSL, 8));
TEST_ZEROING(Movi(v0.V2S(), 0x10, LSL, 8));
TEST_ZEROING(Mul(v0.V4S(), v0.V4S(), v0.S(), 0));
TEST_ZEROING(Mul(v0.V4H(), v0.V4H(), v0.H(), 0));
TEST_ZEROING(Mul(v0.V4S(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Mul(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Mvni(v0.V8H(), 0x10, LSL, 8));
TEST_ZEROING(Mvni(v0.V4H(), 0x10, LSL, 8));
TEST_ZEROING(Neg(v0.V4S(), v0.V4S()));
TEST_ZEROING(Neg(v0.V4H(), v0.V4H()));
TEST_ZEROING(Mvn(v0.V16B(), v0.V16B()));
TEST_ZEROING(Mvn(v0.V8B(), v0.V8B()));
TEST_ZEROING(Orn(v0.V8B(), v0.V8B(), v0.V8B()));
TEST_ZEROING(Orn(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Orr(v0.V8H(), 0x10, 8));
TEST_ZEROING(Orr(v0.V4H(), 0x10, 8));
TEST_ZEROING(Mov(v0.V8B(), v0.V8B()));
TEST_ZEROING(Mov(v0.V16B(), v0.V16B()));
TEST_ZEROING(Pmul(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Pmull(v0.V8H(), v0.V8B(), v0.V8B()));
TEST_ZEROING(Pmull2(v0.V8H(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Raddhn2(v0.V16B(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Raddhn(v0.V4H(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Rbit(v0.V8B(), v0.V8B()));
TEST_ZEROING(Rbit(v0.V16B(), v0.V16B()));
TEST_ZEROING(Rsubhn2(v0.V16B(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Rsubhn(v0.V4H(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Saba(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Saba(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Saba(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Sabal2(v0.V8H(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Sabal(v0.V4S(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Sabd(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Sabd(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Sabd(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Sabdl2(v0.V8H(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Sabdl(v0.V4S(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Sadalp(v0.V8H(), v0.V16B()));
TEST_ZEROING(Saddl2(v0.V8H(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Saddl(v0.V2D(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Saddl(v0.V4S(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Saddw2(v0.V8H(), v0.V8H(), v0.V16B()));
TEST_ZEROING(Saddw(v0.V4S(), v0.V4S(), v0.V4H()));
TEST_ZEROING(Scvtf(v0.V4S(), v0.V4S()));
TEST_ZEROING(Scvtf(v0.V8H(), v0.V8H()));
TEST_ZEROING(Scvtf(v0.V2D(), v0.V2D(), 8));
TEST_ZEROING(Sdot(v0.V4S(), v0.V16B(), v0.S4B(), 0));
TEST_ZEROING(Sdot(v0.V2S(), v0.V8B(), v0.S4B(), 0));
TEST_ZEROING(Sdot(v0.V4S(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Sdot(v0.V2S(), v0.V8B(), v0.V8B()));
TEST_ZEROING(Shadd(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Shadd(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Shl(v0.V2D(), v0.V2D(), 56));
TEST_ZEROING(Shll2(v0.V8H(), v0.V16B(), 8));
TEST_ZEROING(Shll(v0.V2D(), v0.V2S(), 32));
TEST_ZEROING(Shsub(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Shsub(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Sli(v0.V2D(), v0.V2D(), 56));
TEST_ZEROING(Sli(v0.V2S(), v0.V2S(), 16));
TEST_ZEROING(Smax(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Smax(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Smaxp(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Smaxp(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Smin(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Smin(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Sminp(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Sminp(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Smlal2(v0.V8H(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Smlal(v0.V2D(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Smlal(v0.V2D(), v0.V2S(), v0.S(), 0));
TEST_ZEROING(Smlsl2(v0.V8H(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Smlsl(v0.V2D(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Smlsl(v0.V2D(), v0.V2S(), v0.S(), 0));
TEST_ZEROING(Smull2(v0.V8H(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Smull(v0.V2D(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Smull(v0.V2D(), v0.V2S(), v0.S(), 0));
TEST_ZEROING(Sqabs(v0.V16B(), v0.V16B()));
TEST_ZEROING(Sqabs(v0.V4H(), v0.V4H()));
TEST_ZEROING(Sqadd(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Sqadd(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Sqdmlal2(v0.V4S(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Sqdmlal(v0.V2D(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Sqdmlal(v0.V2D(), v0.V2S(), v0.S(), 0));
TEST_ZEROING(Sqdmlsl2(v0.V4S(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Sqdmlsl(v0.V2D(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Sqdmlsl(v0.V2D(), v0.V2S(), v0.S(), 0));
TEST_ZEROING(Sqdmulh(v0.V4S(), v0.V4S(), v0.S(), 0));
TEST_ZEROING(Sqdmulh(v0.V4H(), v0.V4H(), v0.H(), 0));
TEST_ZEROING(Sqdmulh(v0.V4S(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Sqdmulh(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Sqdmull2(v0.V2D(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Sqdmull(v0.V4S(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Sqdmull2(v0.V2D(), v0.V4S(), v0.S(), 0));
TEST_ZEROING(Sqdmull(v0.V4S(), v0.V4H(), v0.H(), 0));
TEST_ZEROING(Sqneg(v0.V16B(), v0.V16B()));
TEST_ZEROING(Sqneg(v0.V2S(), v0.V2S()));
TEST_ZEROING(Sqrdmlah(v0.V4S(), v0.V4S(), v0.S(), 0));
TEST_ZEROING(Sqrdmlah(v0.V4H(), v0.V4H(), v0.H(), 0));
TEST_ZEROING(Sqrdmlah(v0.V4S(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Sqrdmlah(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Sqrdmlsh(v0.V4S(), v0.V4S(), v0.S(), 0));
TEST_ZEROING(Sqrdmlsh(v0.V4H(), v0.V4H(), v0.H(), 0));
TEST_ZEROING(Sqrdmlsh(v0.V4S(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Sqrdmlsh(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Sqrdmulh(v0.V4S(), v0.V4S(), v0.S(), 0));
TEST_ZEROING(Sqrdmulh(v0.V4H(), v0.V4H(), v0.H(), 0));
TEST_ZEROING(Sqrdmulh(v0.V4S(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Sqrdmulh(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Sqrshl(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Sqrshl(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Sqshl(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Sqshl(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Sqshl(v0.V2D(), v0.V2D(), 56));
TEST_ZEROING(Sqshl(v0.V2S(), v0.V2S(), 16));
TEST_ZEROING(Sqshlu(v0.V2D(), v0.V2D(), 56));
TEST_ZEROING(Sqshlu(v0.V2S(), v0.V2S(), 16));
TEST_ZEROING(Sqsub(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Sqsub(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Sqxtn2(v0.V16B(), v0.V8H()));
TEST_ZEROING(Sqxtn(v0.V2S(), v0.V2D()));
TEST_ZEROING(Sqxtun2(v0.V16B(), v0.V8H()));
TEST_ZEROING(Sqxtun(v0.V2S(), v0.V2D()));
TEST_ZEROING(Srhadd(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Srhadd(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Sri(v0.V2D(), v0.V2D(), 8));
TEST_ZEROING(Sri(v0.V2S(), v0.V2S(), 8));
TEST_ZEROING(Srshl(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Srshl(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Srshr(v0.V2D(), v0.V2D(), 8));
TEST_ZEROING(Srshr(v0.V2S(), v0.V2S(), 8));
TEST_ZEROING(Srsra(v0.V2D(), v0.V2D(), 8));
TEST_ZEROING(Srsra(v0.V2S(), v0.V2S(), 8));
TEST_ZEROING(Sshl(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Sshl(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Sshr(v0.V2D(), v0.V2D(), 8));
TEST_ZEROING(Sshr(v0.V2S(), v0.V2S(), 8));
TEST_ZEROING(Ssra(v0.V2D(), v0.V2D(), 8));
TEST_ZEROING(Ssra(v0.V2S(), v0.V2S(), 8));
TEST_ZEROING(Ssubl2(v0.V8H(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Ssubl(v0.V4S(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Ssubw2(v0.V8H(), v0.V8H(), v0.V16B()));
TEST_ZEROING(Ssubw(v0.V4S(), v0.V4S(), v0.V4H()));
TEST_ZEROING(Sub(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Sub(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Subhn2(v0.V16B(), v0.V8H(), v0.V8H()));
TEST_ZEROING(Subhn(v0.V4H(), v0.V4S(), v0.V4S()));
TEST_ZEROING(Sudot(v0.V4S(), v0.V16B(), v0.S4B(), 0));
TEST_ZEROING(Sudot(v0.V2S(), v0.V8B(), v0.S4B(), 2));
TEST_ZEROING(Suqadd(v0.V16B(), v0.V16B()));
TEST_ZEROING(Suqadd(v0.V4H(), v0.V4H()));
TEST_ZEROING(Tbl(v0.V8B(), {v0.V16B()}, v0.V8B()));
TEST_ZEROING(Tbl(v0.V16B(), {v0.V16B()}, v0.V16B()));
TEST_ZEROING(Tbx(v0.V8B(), {v0.V16B()}, v0.V8B()));
TEST_ZEROING(Tbx(v0.V16B(), {v0.V16B()}, v0.V16B()));
TEST_ZEROING(Trn1(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Trn1(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Trn2(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Trn2(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Uaba(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Uaba(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Uabal2(v0.V8H(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Uabal(v0.V4S(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Uabd(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Uabd(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Uabdl2(v0.V8H(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Uabdl(v0.V4S(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Uadalp(v0.V8H(), v0.V16B()));
TEST_ZEROING(Uadalp(v0.V2S(), v0.V4H()));
TEST_ZEROING(Uaddl2(v0.V8H(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Uaddl(v0.V4S(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Uaddlp(v0.V8H(), v0.V16B()));
TEST_ZEROING(Uaddlp(v0.V2S(), v0.V4H()));
TEST_ZEROING(Uaddw2(v0.V8H(), v0.V8H(), v0.V16B()));
TEST_ZEROING(Uaddw(v0.V4S(), v0.V4S(), v0.V4H()));
TEST_ZEROING(Ucvtf(v0.V4S(), v0.V4S()));
TEST_ZEROING(Ucvtf(v0.V4H(), v0.V4H()));
TEST_ZEROING(Ucvtf(v0.V2D(), v0.V2D(), 8));
TEST_ZEROING(Ucvtf(v0.V2S(), v0.V2S(), 8));
TEST_ZEROING(Udot(v0.V4S(), v0.V16B(), v0.S4B(), 0));
TEST_ZEROING(Udot(v0.V2S(), v0.V8B(), v0.S4B(), 0));
TEST_ZEROING(Udot(v0.V2S(), v0.V8B(), v0.V8B()));
TEST_ZEROING(Udot(v0.V4S(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Uhadd(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Uhadd(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Uhsub(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Uhsub(v0.V2S(), v0.V2S(), v0.V2S()));
TEST_ZEROING(Umax(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Umax(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Umaxp(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Umaxp(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Umin(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Umin(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Uminp(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Uminp(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Umlal2(v0.V8H(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Umlal(v0.V4S(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Umlal(v0.V2D(), v0.V2S(), v0.S(), 0));
TEST_ZEROING(Umlal(v0.V4S(), v0.V4H(), v0.H(), 0));
TEST_ZEROING(Umlsl2(v0.V8H(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Umlsl(v0.V4S(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Umlsl(v0.V2D(), v0.V2S(), v0.S(), 0));
TEST_ZEROING(Umlsl(v0.V4S(), v0.V4H(), v0.H(), 0));
TEST_ZEROING(Umull2(v0.V8H(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Umull(v0.V4S(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Umull(v0.V2D(), v0.V2S(), v0.S(), 0));
TEST_ZEROING(Umull(v0.V4S(), v0.V4H(), v0.H(), 0));
TEST_ZEROING(Uqadd(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Uqadd(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Uqrshl(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Uqrshl(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Uqshl(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Uqshl(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Uqsub(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Uqsub(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Uqxtn2(v0.V16B(), v0.V8H()));
TEST_ZEROING(Uqxtn(v0.V2S(), v0.V2D()));
TEST_ZEROING(Urecpe(v0.V2S(), v0.V2S()));
TEST_ZEROING(Urecpe(v0.V4S(), v0.V4S()));
TEST_ZEROING(Urhadd(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Urhadd(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Urshl(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Urshl(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Urshr(v0.V2D(), v0.V2D(), 8));
TEST_ZEROING(Urshr(v0.V2S(), v0.V2S(), 8));
TEST_ZEROING(Ursqrte(v0.V4S(), v0.V4S()));
TEST_ZEROING(Ursqrte(v0.V2S(), v0.V2S()));
TEST_ZEROING(Ursra(v0.V2D(), v0.V2D(), 8));
TEST_ZEROING(Ursra(v0.V2S(), v0.V2S(), 8));
TEST_ZEROING(Usdot(v0.V4S(), v0.V16B(), v0.S4B(), 0));
TEST_ZEROING(Usdot(v0.V2S(), v0.V8B(), v0.S4B(), 1));
TEST_ZEROING(Usdot(v0.V4S(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Usdot(v0.V2S(), v0.V8B(), v0.V8B()));
TEST_ZEROING(Ushl(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Ushl(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Ushr(v0.V2D(), v0.V2D(), 8));
TEST_ZEROING(Ushr(v0.V2S(), v0.V2S(), 8));
TEST_ZEROING(Usqadd(v0.V16B(), v0.V16B()));
TEST_ZEROING(Usqadd(v0.V4H(), v0.V4H()));
TEST_ZEROING(Usra(v0.V2D(), v0.V2D(), 8));
TEST_ZEROING(Usra(v0.V2S(), v0.V2S(), 8));
TEST_ZEROING(Usubl2(v0.V8H(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Usubl(v0.V4S(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Usubw2(v0.V8H(), v0.V8H(), v0.V16B()));
TEST_ZEROING(Usubw(v0.V4S(), v0.V4S(), v0.V4H()));
TEST_ZEROING(Uzp1(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Uzp1(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Uzp2(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Uzp2(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Xtn2(v0.V16B(), v0.V8H()));
TEST_ZEROING(Xtn(v0.V4H(), v0.V4S()));
TEST_ZEROING(Zip1(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Zip1(v0.V4H(), v0.V4H(), v0.V4H()));
TEST_ZEROING(Zip2(v0.V16B(), v0.V16B(), v0.V16B()));
TEST_ZEROING(Zip2(v0.V4H(), v0.V4H(), v0.V4H()));
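// Check the accumulated result. The low 128 bits of z10 hold legitimate NEON
// results, so they are shifted out below; whatever remains must be zero if
// every instruction above cleared the bits beyond bit 127.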
__ Mov(z11.VnD(), 0);
Label done, zero_127_to_0;
__ Rdvl(x0, 1);
__ Cmp(x0, 16);
__ B(gt, &zero_127_to_0);
// For a 128-bit VL, there are no bits above 127 to test, so zero the whole
// register.
__ Mov(z10.VnD(), 0);
__ B(&done);
// Shift out the low 128 bits, which legitimately hold non-zero results.
__ Bind(&zero_127_to_0);
__ Ext(z10.VnB(), z10.VnB(), z11.VnB(), kDRegSizeInBytes * 2);
__ Bind(&done);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(z11, z10);
}
}
#undef TEST_ZEROING
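// As above, but for NEON loads: each macro fills the destination registers
// with a non-zero index pattern, performs the load, then ORs the results into
// the cumulative register z10, where any bits beyond bit 127 that were not
// zeroed will accumulate and be detected by the final check.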
#define TEST_ZEROING_1(INST) \
__ Index(z0.VnB(), 0, 1); \
__ INST; \
__ Orr(z10.VnB(), z10.VnB(), z0.VnB());
#define TEST_ZEROING_2(INST) \
__ Index(z0.VnB(), 0, 1); \
__ Index(z1.VnB(), 0, 1); \
__ INST; \
__ Orr(z10.VnB(), z10.VnB(), z0.VnB()); \
__ Orr(z10.VnB(), z10.VnB(), z1.VnB());
#define TEST_ZEROING_3(INST) \
__ Index(z0.VnB(), 0, 1); \
__ Index(z1.VnB(), 0, 1); \
__ Index(z2.VnB(), 0, 1); \
__ INST; \
__ Orr(z10.VnB(), z10.VnB(), z0.VnB()); \
__ Orr(z10.VnB(), z10.VnB(), z1.VnB()); \
__ Orr(z10.VnB(), z10.VnB(), z2.VnB());
#define TEST_ZEROING_4(INST) \
__ Index(z0.VnB(), 0, 1); \
__ Index(z1.VnB(), 0, 1); \
__ Index(z2.VnB(), 0, 1); \
__ Index(z3.VnB(), 0, 1); \
__ INST; \
__ Orr(z10.VnB(), z10.VnB(), z0.VnB()); \
__ Orr(z10.VnB(), z10.VnB(), z1.VnB()); \
__ Orr(z10.VnB(), z10.VnB(), z2.VnB()); \
__ Orr(z10.VnB(), z10.VnB(), z3.VnB());
TEST_SVE(neon_load_zero_high) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kNEON, CPUFeatures::kSVE);
START();
__ Mov(z10.VnD(), 0); // Initialise cumulative result register.
// Initialise x0 to point to a buffer from which data is loaded. The contents
// of the buffer do not need to be defined.
int data_size = 4 * kQRegSizeInBytes;
uint8_t* data = new uint8_t[data_size];
__ Mov(x0, reinterpret_cast<uintptr_t>(data));
MemOperand mop = MemOperand(x0);
TEST_ZEROING_1(Ld1(v0.V16B(), mop));
TEST_ZEROING_1(Ld1(v0.V4H(), mop));
TEST_ZEROING_1(Ld1(v0.V16B(), v1.V16B(), mop));
TEST_ZEROING_1(Ld1(v0.V4H(), v1.V4H(), mop));
TEST_ZEROING_1(Ld1(v0.V16B(), v1.V16B(), v2.V16B(), mop));
TEST_ZEROING_1(Ld1(v0.V4H(), v1.V4H(), v2.V4H(), mop));
TEST_ZEROING_1(Ld1(v0.V16B(), v1.V16B(), v2.V16B(), v3.V16B(), mop));
TEST_ZEROING_1(Ld1(v0.V4H(), v1.V4H(), v2.V4H(), v3.V4H(), mop));
TEST_ZEROING_1(Ld1(v0.B(), 1, mop));
TEST_ZEROING_1(Ld1(v0.D(), 1, mop));
TEST_ZEROING_1(Ld1(v0.H(), 1, mop));
TEST_ZEROING_1(Ld1(v0.S(), 1, mop));
TEST_ZEROING_1(Ld1r(v0.V16B(), mop));
TEST_ZEROING_1(Ld1r(v0.V4H(), mop));
TEST_ZEROING_2(Ld2(v0.V16B(), v1.V16B(), mop));
TEST_ZEROING_2(Ld2(v0.V4H(), v1.V4H(), mop));
TEST_ZEROING_2(Ld2(v0.B(), v1.B(), 1, mop));
TEST_ZEROING_2(Ld2(v0.D(), v1.D(), 1, mop));
TEST_ZEROING_2(Ld2(v0.H(), v1.H(), 1, mop));
TEST_ZEROING_2(Ld2(v0.S(), v1.S(), 1, mop));
TEST_ZEROING_2(Ld2r(v0.V16B(), v1.V16B(), mop));
TEST_ZEROING_2(Ld2r(v0.V4H(), v1.V4H(), mop));
TEST_ZEROING_3(Ld3(v0.V16B(), v1.V16B(), v2.V16B(), mop));
TEST_ZEROING_3(Ld3(v0.V4H(), v1.V4H(), v2.V4H(), mop));
TEST_ZEROING_3(Ld3(v0.B(), v1.B(), v2.B(), 1, mop));
TEST_ZEROING_3(Ld3(v0.D(), v1.D(), v2.D(), 1, mop));
TEST_ZEROING_3(Ld3(v0.H(), v1.H(), v2.H(), 1, mop));
TEST_ZEROING_3(Ld3(v0.S(), v1.S(), v2.S(), 1, mop));
TEST_ZEROING_3(Ld3r(v0.V16B(), v1.V16B(), v2.V16B(), mop));
TEST_ZEROING_3(Ld3r(v0.V4H(), v1.V4H(), v2.V4H(), mop));
TEST_ZEROING_4(Ld4(v0.V16B(), v1.V16B(), v2.V16B(), v3.V16B(), mop));
TEST_ZEROING_4(Ld4(v0.V4H(), v1.V4H(), v2.V4H(), v3.V4H(), mop));
TEST_ZEROING_4(Ld4(v0.B(), v1.B(), v2.B(), v3.B(), 1, mop));
TEST_ZEROING_4(Ld4(v0.D(), v1.D(), v2.D(), v3.D(), 1, mop));
TEST_ZEROING_4(Ld4(v0.H(), v1.H(), v2.H(), v3.H(), 1, mop));
TEST_ZEROING_4(Ld4(v0.S(), v1.S(), v2.S(), v3.S(), 1, mop));
TEST_ZEROING_4(Ld4r(v0.V16B(), v1.V16B(), v2.V16B(), v3.V16B(), mop));
TEST_ZEROING_4(Ld4r(v0.V4H(), v1.V4H(), v2.V4H(), v3.V4H(), mop));
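// Check the accumulated result, as above: shift out the low 128 bits of z10
// and compare the remainder with zero.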
__ Mov(z11.VnD(), 0);
Label done, zero_127_to_0;
__ Rdvl(x0, 1);
__ Cmp(x0, 16);
__ B(gt, &zero_127_to_0);
// For a 128-bit VL, there are no bits above 127 to test, so zero the whole
// register.
__ Mov(z10.VnD(), 0);
__ B(&done);
// Shift out the low 128 bits, which legitimately hold non-zero results.
__ Bind(&zero_127_to_0);
__ Ext(z10.VnB(), z10.VnB(), z11.VnB(), kDRegSizeInBytes * 2);
__ Bind(&done);
END();
if (CAN_RUN()) {
RUN();
ASSERT_EQUAL_SVE(z11, z10);
}
delete[] data;
}
#undef TEST_ZEROING_1
#undef TEST_ZEROING_2
#undef TEST_ZEROING_3
#undef TEST_ZEROING_4
TEST_SVE(sve_load_store_sp_base_regression_test) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
START();
__ Mov(x0, 0);
__ Mov(z0.VnB(), 0);
__ Ptrue(p0.VnB());
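// Push 128 pairs of xzr to reserve (and zero) 2KB of stack below sp, so that
// the [sp]-based accesses below touch valid, owned memory. The matching Drop
// after the scope reclaims this space.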
Label loop;
__ Mov(x1, 128);
__ Bind(&loop);
__ Push(xzr, xzr);
__ Sub(x1, x1, 1);
__ Cbnz(x1, &loop);
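// Every encoding below has its base register field (bits [9:5]) set to
// 0b11111, which must be decoded as sp for addressing purposes, not xzr. Raw
// `dci` encodings are used so that exactly these bit patterns are exercised,
// independently of how the MacroAssembler might encode equivalent
// instructions.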
{
ExactAssemblyScope scope(&masm, 193 * kInstructionSize);
__ dci(0xa420a3e0); // ld1b {z0.h}, p0/z, [sp]
__ dci(0xa440a3e0); // ld1b {z0.s}, p0/z, [sp]
__ dci(0xa460a3e0); // ld1b {z0.d}, p0/z, [sp]
__ dci(0xa400a3e0); // ld1b {z0.b}, p0/z, [sp]
__ dci(0xa42043e0); // ld1b {z0.h}, p0/z, [sp, x0]
__ dci(0xa44043e0); // ld1b {z0.s}, p0/z, [sp, x0]
__ dci(0xa46043e0); // ld1b {z0.d}, p0/z, [sp, x0]
__ dci(0xa40043e0); // ld1b {z0.b}, p0/z, [sp, x0]
__ dci(0xc440c3e0); // ld1b {z0.d}, p0/z, [sp, z0.d]
__ dci(0xa5e0a3e0); // ld1d {z0.d}, p0/z, [sp]
__ dci(0xa5e043e0); // ld1d {z0.d}, p0/z, [sp, x0, lsl #3]
__ dci(0xc5e0c3e0); // ld1d {z0.d}, p0/z, [sp, z0.d, lsl #3]
__ dci(0xc5c0c3e0); // ld1d {z0.d}, p0/z, [sp, z0.d]
__ dci(0xa4a0a3e0); // ld1h {z0.h}, p0/z, [sp]
__ dci(0xa4c0a3e0); // ld1h {z0.s}, p0/z, [sp]
__ dci(0xa4e0a3e0); // ld1h {z0.d}, p0/z, [sp]
__ dci(0xa4a043e0); // ld1h {z0.h}, p0/z, [sp, x0, lsl #1]
__ dci(0xa4c043e0); // ld1h {z0.s}, p0/z, [sp, x0, lsl #1]
__ dci(0xa4e043e0); // ld1h {z0.d}, p0/z, [sp, x0, lsl #1]
__ dci(0xc4e0c3e0); // ld1h {z0.d}, p0/z, [sp, z0.d, lsl #1]
__ dci(0xc4c0c3e0); // ld1h {z0.d}, p0/z, [sp, z0.d]
__ dci(0x8440a3e0); // ld1rb {z0.h}, p0/z, [sp]
__ dci(0x8440c3e0); // ld1rb {z0.s}, p0/z, [sp]
__ dci(0x8440e3e0); // ld1rb {z0.d}, p0/z, [sp]
__ dci(0x844083e0); // ld1rb {z0.b}, p0/z, [sp]
__ dci(0x85c0e3e0); // ld1rd {z0.d}, p0/z, [sp]
__ dci(0x84c0a3e0); // ld1rh {z0.h}, p0/z, [sp]
__ dci(0x84c0c3e0); // ld1rh {z0.s}, p0/z, [sp]
__ dci(0x84c0e3e0); // ld1rh {z0.d}, p0/z, [sp]
__ dci(0xa40023e0); // ld1rqb {z0.b}, p0/z, [sp]
__ dci(0xa40003e0); // ld1rqb {z0.b}, p0/z, [sp, x0]
__ dci(0xa58023e0); // ld1rqd {z0.d}, p0/z, [sp]
__ dci(0xa58003e0); // ld1rqd {z0.d}, p0/z, [sp, x0, lsl #3]
__ dci(0xa48023e0); // ld1rqh {z0.h}, p0/z, [sp]
__ dci(0xa48003e0); // ld1rqh {z0.h}, p0/z, [sp, x0, lsl #1]
__ dci(0xa50023e0); // ld1rqw {z0.s}, p0/z, [sp]
__ dci(0xa50003e0); // ld1rqw {z0.s}, p0/z, [sp, x0, lsl #2]
__ dci(0x85c0c3e0); // ld1rsb {z0.h}, p0/z, [sp]
__ dci(0x85c0a3e0); // ld1rsb {z0.s}, p0/z, [sp]
__ dci(0x85c083e0); // ld1rsb {z0.d}, p0/z, [sp]
__ dci(0x8540a3e0); // ld1rsh {z0.s}, p0/z, [sp]
__ dci(0x854083e0); // ld1rsh {z0.d}, p0/z, [sp]
__ dci(0x84c083e0); // ld1rsw {z0.d}, p0/z, [sp]
__ dci(0x8540c3e0); // ld1rw {z0.s}, p0/z, [sp]
__ dci(0x8540e3e0); // ld1rw {z0.d}, p0/z, [sp]
__ dci(0xa5c0a3e0); // ld1sb {z0.h}, p0/z, [sp]
__ dci(0xa5a0a3e0); // ld1sb {z0.s}, p0/z, [sp]
__ dci(0xa580a3e0); // ld1sb {z0.d}, p0/z, [sp]
__ dci(0xa5c043e0); // ld1sb {z0.h}, p0/z, [sp, x0]
__ dci(0xa5a043e0); // ld1sb {z0.s}, p0/z, [sp, x0]
__ dci(0xa58043e0); // ld1sb {z0.d}, p0/z, [sp, x0]
__ dci(0xc44083e0); // ld1sb {z0.d}, p0/z, [sp, z0.d]
__ dci(0xa520a3e0); // ld1sh {z0.s}, p0/z, [sp]
__ dci(0xa500a3e0); // ld1sh {z0.d}, p0/z, [sp]
__ dci(0xa52043e0); // ld1sh {z0.s}, p0/z, [sp, x0, lsl #1]
__ dci(0xa50043e0); // ld1sh {z0.d}, p0/z, [sp, x0, lsl #1]
__ dci(0xc4e083e0); // ld1sh {z0.d}, p0/z, [sp, z0.d, lsl #1]
__ dci(0xc4c083e0); // ld1sh {z0.d}, p0/z, [sp, z0.d]
__ dci(0xa480a3e0); // ld1sw {z0.d}, p0/z, [sp]
__ dci(0xa48043e0); // ld1sw {z0.d}, p0/z, [sp, x0, lsl #2]
__ dci(0xc56083e0); // ld1sw {z0.d}, p0/z, [sp, z0.d, lsl #2]
__ dci(0xc54083e0); // ld1sw {z0.d}, p0/z, [sp, z0.d]
__ dci(0xa540a3e0); // ld1w {z0.s}, p0/z, [sp]
__ dci(0xa560a3e0); // ld1w {z0.d}, p0/z, [sp]
__ dci(0xa54043e0); // ld1w {z0.s}, p0/z, [sp, x0, lsl #2]
__ dci(0xa56043e0); // ld1w {z0.d}, p0/z, [sp, x0, lsl #2]
__ dci(0xc560c3e0); // ld1w {z0.d}, p0/z, [sp, z0.d, lsl #2]
__ dci(0xc540c3e0); // ld1w {z0.d}, p0/z, [sp, z0.d]
__ dci(0xa420e3e0); // ld2b {z0.b, z1.b}, p0/z, [sp]
__ dci(0xa420c3e0); // ld2b {z0.b, z1.b}, p0/z, [sp, x0]
__ dci(0xa5a0e3e0); // ld2d {z0.d, z1.d}, p0/z, [sp]
__ dci(0xa5a0c3e0); // ld2d {z0.d, z1.d}, p0/z, [sp, x0, lsl #3]
__ dci(0xa4a0e3e0); // ld2h {z0.h, z1.h}, p0/z, [sp]
__ dci(0xa4a0c3e0); // ld2h {z0.h, z1.h}, p0/z, [sp, x0, lsl #1]
__ dci(0xa520e3e0); // ld2w {z0.s, z1.s}, p0/z, [sp]
__ dci(0xa520c3e0); // ld2w {z0.s, z1.s}, p0/z, [sp, x0, lsl #2]
__ dci(0xa440e3e0); // ld3b {z0.b, z1.b, z2.b}, p0/z, [sp]
__ dci(0xa440c3e0); // ld3b {z0.b, z1.b, z2.b}, p0/z, [sp, x0]
__ dci(0xa5c0e3e0); // ld3d {z0.d, z1.d, z2.d}, p0/z, [sp]
__ dci(0xa5c0c3e0); // ld3d {z0.d, z1.d, z2.d}, p0/z, [sp, x0, lsl #3]
__ dci(0xa4c0e3e0); // ld3h {z0.h, z1.h, z2.h}, p0/z, [sp]
__ dci(0xa4c0c3e0); // ld3h {z0.h, z1.h, z2.h}, p0/z, [sp, x0, lsl #1]
__ dci(0xa540e3e0); // ld3w {z0.s, z1.s, z2.s}, p0/z, [sp]
__ dci(0xa540c3e0); // ld3w {z0.s, z1.s, z2.s}, p0/z, [sp, x0, lsl #2]
__ dci(0xa460e3e0); // ld4b {z0.b, z1.b, z2.b, z3.b}, p0/z, [sp]
__ dci(0xa460c3e0); // ld4b {z0.b, z1.b, z2.b, z3.b}, p0/z, [sp, x0]
__ dci(0xa5e0e3e0); // ld4d {z0.d, z1.d, z2.d, z3.d}, p0/z, [sp]
__ dci(
0xa5e0c3e0); // ld4d {z0.d, z1.d, z2.d, z3.d}, p0/z, [sp, x0, lsl #3]
__ dci(0xa4e0e3e0); // ld4h {z0.h, z1.h, z2.h, z3.h}, p0/z, [sp]
__ dci(
0xa4e0c3e0); // ld4h {z0.h, z1.h, z2.h, z3.h}, p0/z, [sp, x0, lsl #1]
__ dci(0xa560e3e0); // ld4w {z0.s, z1.s, z2.s, z3.s}, p0/z, [sp]
__ dci(
0xa560c3e0); // ld4w {z0.s, z1.s, z2.s, z3.s}, p0/z, [sp, x0, lsl #2]
__ dci(0xa42063e0); // ldff1b {z0.h}, p0/z, [sp, x0]
__ dci(0xa44063e0); // ldff1b {z0.s}, p0/z, [sp, x0]
__ dci(0xa46063e0); // ldff1b {z0.d}, p0/z, [sp, x0]
__ dci(0xa40063e0); // ldff1b {z0.b}, p0/z, [sp, x0]
__ dci(0xc440e3e0); // ldff1b {z0.d}, p0/z, [sp, z0.d]
__ dci(0xa5e063e0); // ldff1d {z0.d}, p0/z, [sp, x0, lsl #3]
__ dci(0xc5e0e3e0); // ldff1d {z0.d}, p0/z, [sp, z0.d, lsl #3]
__ dci(0xc5c0e3e0); // ldff1d {z0.d}, p0/z, [sp, z0.d]
__ dci(0xa4a063e0); // ldff1h {z0.h}, p0/z, [sp, x0, lsl #1]
__ dci(0xa4c063e0); // ldff1h {z0.s}, p0/z, [sp, x0, lsl #1]
__ dci(0xa4e063e0); // ldff1h {z0.d}, p0/z, [sp, x0, lsl #1]
__ dci(0xc4e0e3e0); // ldff1h {z0.d}, p0/z, [sp, z0.d, lsl #1]
__ dci(0xc4c0e3e0); // ldff1h {z0.d}, p0/z, [sp, z0.d]
__ dci(0xa5c063e0); // ldff1sb {z0.h}, p0/z, [sp, x0]
__ dci(0xa5a063e0); // ldff1sb {z0.s}, p0/z, [sp, x0]
__ dci(0xa58063e0); // ldff1sb {z0.d}, p0/z, [sp, x0]
__ dci(0xc440a3e0); // ldff1sb {z0.d}, p0/z, [sp, z0.d]
__ dci(0xa52063e0); // ldff1sh {z0.s}, p0/z, [sp, x0, lsl #1]
__ dci(0xa50063e0); // ldff1sh {z0.d}, p0/z, [sp, x0, lsl #1]
__ dci(0xc4e0a3e0); // ldff1sh {z0.d}, p0/z, [sp, z0.d, lsl #1]
__ dci(0xc4c0a3e0); // ldff1sh {z0.d}, p0/z, [sp, z0.d]
__ dci(0xa48063e0); // ldff1sw {z0.d}, p0/z, [sp, x0, lsl #2]
__ dci(0xc560a3e0); // ldff1sw {z0.d}, p0/z, [sp, z0.d, lsl #2]
__ dci(0xc540a3e0); // ldff1sw {z0.d}, p0/z, [sp, z0.d]
__ dci(0xa54063e0); // ldff1w {z0.s}, p0/z, [sp, x0, lsl #2]
__ dci(0xa56063e0); // ldff1w {z0.d}, p0/z, [sp, x0, lsl #2]
__ dci(0xc560e3e0); // ldff1w {z0.d}, p0/z, [sp, z0.d, lsl #2]
__ dci(0xc540e3e0); // ldff1w {z0.d}, p0/z, [sp, z0.d]
__ dci(0xa430a3e0); // ldnf1b {z0.h}, p0/z, [sp]
__ dci(0xa450a3e0); // ldnf1b {z0.s}, p0/z, [sp]
__ dci(0xa470a3e0); // ldnf1b {z0.d}, p0/z, [sp]
__ dci(0xa410a3e0); // ldnf1b {z0.b}, p0/z, [sp]
__ dci(0xa5f0a3e0); // ldnf1d {z0.d}, p0/z, [sp]
__ dci(0xa4b0a3e0); // ldnf1h {z0.h}, p0/z, [sp]
__ dci(0xa4d0a3e0); // ldnf1h {z0.s}, p0/z, [sp]
__ dci(0xa4f0a3e0); // ldnf1h {z0.d}, p0/z, [sp]
__ dci(0xa5d0a3e0); // ldnf1sb {z0.h}, p0/z, [sp]
__ dci(0xa5b0a3e0); // ldnf1sb {z0.s}, p0/z, [sp]
__ dci(0xa590a3e0); // ldnf1sb {z0.d}, p0/z, [sp]
__ dci(0xa530a3e0); // ldnf1sh {z0.s}, p0/z, [sp]
__ dci(0xa510a3e0); // ldnf1sh {z0.d}, p0/z, [sp]
__ dci(0xa490a3e0); // ldnf1sw {z0.d}, p0/z, [sp]
__ dci(0xa550a3e0); // ldnf1w {z0.s}, p0/z, [sp]
__ dci(0xa570a3e0); // ldnf1w {z0.d}, p0/z, [sp]
__ dci(0xa400e3e0); // ldnt1b {z0.b}, p0/z, [sp]
__ dci(0xa400c3e0); // ldnt1b {z0.b}, p0/z, [sp, x0]
__ dci(0xa580e3e0); // ldnt1d {z0.d}, p0/z, [sp]
__ dci(0xa580c3e0); // ldnt1d {z0.d}, p0/z, [sp, x0, lsl #3]
__ dci(0xa480e3e0); // ldnt1h {z0.h}, p0/z, [sp]
__ dci(0xa480c3e0); // ldnt1h {z0.h}, p0/z, [sp, x0, lsl #1]
__ dci(0xa500e3e0); // ldnt1w {z0.s}, p0/z, [sp]
__ dci(0xa500c3e0); // ldnt1w {z0.s}, p0/z, [sp, x0, lsl #2]
__ dci(0x858043e0); // ldr z0, [sp]
__ dci(0xe400e3e0); // st1b {z0.b}, p0, [sp]
__ dci(0xe40043e0); // st1b {z0.b}, p0, [sp, x0]
__ dci(0xe400a3e0); // st1b {z0.d}, p0, [sp, z0.d]
__ dci(0xe5e0e3e0); // st1d {z0.d}, p0, [sp]
__ dci(0xe5e043e0); // st1d {z0.d}, p0, [sp, x0, lsl #3]
__ dci(0xe5a0a3e0); // st1d {z0.d}, p0, [sp, z0.d, lsl #3]
__ dci(0xe580a3e0); // st1d {z0.d}, p0, [sp, z0.d]
__ dci(0xe4e0e3e0); // st1h {z0.d}, p0, [sp]
__ dci(0xe4e043e0); // st1h {z0.d}, p0, [sp, x0, lsl #1]
__ dci(0xe4a0a3e0); // st1h {z0.d}, p0, [sp, z0.d, lsl #1]
__ dci(0xe480a3e0); // st1h {z0.d}, p0, [sp, z0.d]
__ dci(0xe560e3e0); // st1w {z0.d}, p0, [sp]
__ dci(0xe56043e0); // st1w {z0.d}, p0, [sp, x0, lsl #2]
__ dci(0xe430e3e0); // st2b {z0.b, z1.b}, p0, [sp]
__ dci(0xe42063e0); // st2b {z0.b, z1.b}, p0, [sp, x0]
__ dci(0xe5b0e3e0); // st2d {z0.d, z1.d}, p0, [sp]
__ dci(0xe5a063e0); // st2d {z0.d, z1.d}, p0, [sp, x0, lsl #3]
__ dci(0xe4b0e3e0); // st2h {z0.h, z1.h}, p0, [sp]
__ dci(0xe4a063e0); // st2h {z0.h, z1.h}, p0, [sp, x0, lsl #1]
__ dci(0xe530e3e0); // st2w {z0.s, z1.s}, p0, [sp]
__ dci(0xe52063e0); // st2w {z0.s, z1.s}, p0, [sp, x0, lsl #2]
__ dci(0xe450e3e0); // st3b {z0.b, z1.b, z2.b}, p0, [sp]
__ dci(0xe44063e0); // st3b {z0.b, z1.b, z2.b}, p0, [sp, x0]
__ dci(0xe5d0e3e0); // st3d {z0.d, z1.d, z2.d}, p0, [sp]
__ dci(0xe5c063e0); // st3d {z0.d, z1.d, z2.d}, p0, [sp, x0, lsl #3]
__ dci(0xe4d0e3e0); // st3h {z0.h, z1.h, z2.h}, p0, [sp]
__ dci(0xe4c063e0); // st3h {z0.h, z1.h, z2.h}, p0, [sp, x0, lsl #1]
__ dci(0xe550e3e0); // st3w {z0.s, z1.s, z2.s}, p0, [sp]
__ dci(0xe54063e0); // st3w {z0.s, z1.s, z2.s}, p0, [sp, x0, lsl #2]
__ dci(0xe470e3e0); // st4b {z0.b, z1.b, z2.b, z3.b}, p0, [sp]
__ dci(0xe46063e0); // st4b {z0.b, z1.b, z2.b, z3.b}, p0, [sp, x0]
__ dci(0xe5f0e3e0); // st4d {z0.d, z1.d, z2.d, z3.d}, p0, [sp]
__ dci(0xe5e063e0); // st4d {z0.d, z1.d, z2.d, z3.d}, p0, [sp, x0, lsl #3]
__ dci(0xe4f0e3e0); // st4h {z0.h, z1.h, z2.h, z3.h}, p0, [sp]
__ dci(0xe4e063e0); // st4h {z0.h, z1.h, z2.h, z3.h}, p0, [sp, x0, lsl #1]
__ dci(0xe570e3e0); // st4w {z0.s, z1.s, z2.s, z3.s}, p0, [sp]
__ dci(0xe56063e0); // st4w {z0.s, z1.s, z2.s, z3.s}, p0, [sp, x0, lsl #2]
__ dci(0xe410e3e0); // stnt1b {z0.b}, p0, [sp]
__ dci(0xe40063e0); // stnt1b {z0.b}, p0, [sp, x0]
__ dci(0xe590e3e0); // stnt1d {z0.d}, p0, [sp]
__ dci(0xe58063e0); // stnt1d {z0.d}, p0, [sp, x0, lsl #3]
__ dci(0xe490e3e0); // stnt1h {z0.h}, p0, [sp]
__ dci(0xe48063e0); // stnt1h {z0.h}, p0, [sp, x0, lsl #1]
__ dci(0xe510e3e0); // stnt1w {z0.s}, p0, [sp]
__ dci(0xe50063e0); // stnt1w {z0.s}, p0, [sp, x0, lsl #2]
__ dci(0x858003e0); // ldr p0, [sp]
__ dci(0xe58003e0); // str p0, [sp]
__ dci(0xe58043e0); // str z0, [sp]
}
__ Drop(128 * 2 * kXRegSizeInBytes);
END();
if (CAN_RUN()) {
RUN();
// No checks are made here. The test is designed to ensure that the base
// register is interpreted as sp, not xzr. If it were interpreted as xzr,
// the accesses would target addresses near zero, fault, and cause the test
// to fail.
}
}
// Manually constructed simulator test to avoid creating a VL128 variant.
#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
void Test_sve_fmatmul(Test* config) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVEF64MM);
// Only double-precision matrix multiply is tested here. Single-precision is
// tested in the simulator tests using a generated sequence. The (templated)
// code used in the simulator for both cases is the same, which is why the
// tests here don't need to be comprehensive.
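// Fmmla operates on each 256-bit segment independently: the segment of the
// first source is treated as a 2x2 matrix, which is multiplied by the 2x2
// matrix in the corresponding segment of the second source (used transposed,
// as the worked examples in the comments below illustrate), then accumulated
// into the destination segment.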
START();
Label vl_too_short;
__ Rdvl(x0, 1);
__ Cmp(x0, 32);
__ B(lt, &vl_too_short); // Skip testing VL128.
__ Fdup(z0.VnD(), 1.0);
__ Fdup(z1.VnD(), 2.0);
__ Mov(z2.VnD(), 0);
// Build a 2x2 identity matrix in each 256-bit segment of z3.
Label iden_loop;
__ Lsr(x0, x0, 5);  // x0 = the number of 256-bit segments in the VL.
__ Bind(&iden_loop);
__ Insr(z3.VnD(), d0);
__ Insr(z3.VnD(), d2);
__ Insr(z3.VnD(), d2);
__ Insr(z3.VnD(), d0);
__ Sub(x0, x0, 1);
__ Cbnz(x0, &iden_loop);
__ Fmmla(z1.VnD(), z1.VnD(), z0.VnD(), z0.VnD());  // z1 = z1 + (z0 * z0)
__ Fmmla(z2.VnD(), z2.VnD(), z1.VnD(), z3.VnD());  // z2 = 0 + (z1 * identity)
__ Ptrue(p0.VnB());
__ Index(z4.VnD(), -8, 3);  // z4 = {-8, -5, -2, 1, 4, 7, 10, 13, ...}
__ Scvtf(z4.VnD(), p0.Merging(), z4.VnD());  // Convert the indices to doubles.
__ Mov(z5.VnD(), 0);
__ Fmmla(z4.VnD(), z4.VnD(), z4.VnD(), z4.VnD());  // z4 = z4 + (z4 * z4)
__ Fmmla(z5.VnD(), z5.VnD(), z4.VnD(), z3.VnD());  // z5 = 0 + (z4 * identity)
__ Bind(&vl_too_short);
END();
if (CAN_RUN()) {
RUN();
if (core.GetSVELaneCount(kDRegSize) >= 4) { // VL256 or longer.
ASSERT_EQUAL_SVE(z1, z2);
ASSERT_EQUAL_SVE(z4, z5);
// All results are 4.0:
// z0 z0 z1
// (1 1)(1 1) + (2 2) = (4 4)
// (1 1)(1 1) (2 2) (4 4)
uint64_t z1_expected[] =
{0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
0x4010000000000000, 0x4010000000000000};
ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
// The first multiplications (at the highest z4_expected indices) are:
// z4 z4 z4
// (-8 -5)(-8 -2) + (-8 -5) = (81 6)
// (-2 1)(-5 1) (-2 1) ( 9 6)
//
// ( 4 7)( 4 10) + ( 4 7) = ( 69 138)
// (10 13)( 7 13) (10 13) (141 282)
uint64_t z4_expected[] = {
0x40cb690000000000, 0x40c9728000000000, 0x40c9710000000000,
0x40c79e8000000000, 0x40c41f0000000000, 0x40c2708000000000,
0x40c26f0000000000, 0x40c0e48000000000, 0x40bbea0000000000,
0x40b91d0000000000, 0x40b91a0000000000, 0x40b6950000000000,
0x40b1d60000000000, 0x40af320000000000, 0x40af2c0000000000,
0x40ab420000000000, 0x40a4040000000000, 0x40a0aa0000000000,
0x40a0a40000000000, 0x409bb40000000000, 0x4091b80000000000,
0x408a880000000000, 0x408a700000000000, 0x4083c80000000000,
0x4071a00000000000, 0x4061a00000000000, 0x4061400000000000,
0x4051400000000000, 0x4018000000000000, 0x4022000000000000,
0x4018000000000000, 0x4054400000000000,
};
ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
}
}
}
Test* test_sve_fmatmul_list[] =
{Test::MakeSVETest(256, "AARCH64_ASM_sve_fmatmul_vl256", &Test_sve_fmatmul),
Test::MakeSVETest(512, "AARCH64_ASM_sve_fmatmul_vl512", &Test_sve_fmatmul),
Test::MakeSVETest(2048,
"AARCH64_ASM_sve_fmatmul_vl2048",
&Test_sve_fmatmul)};
void Test_sve_ld1ro(Test* config) {
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVEF64MM);
START();
int data_size = (kQRegSizeInBytes + 128) * 4;
uint8_t* data = new uint8_t[data_size];
for (int i = 0; i < data_size; i++) {
data[i] = i & 0xff;
}
// Set the base just past half-way through the buffer so that negative
// offsets can be used; the extra +7 leaves the base unaligned.
__ Mov(x0, reinterpret_cast<uintptr_t>(&data[7 + data_size / 2]));
__ Index(z0.VnB(), 0, 1);
__ Ptrue(p0.VnB());
__ Cmplo(p0.VnB(), p0.Zeroing(), z0.VnB(), 4);  // p0 = byte lanes 0 to 3.
__ Pfalse(p1.VnB());
__ Zip1(p1.VnB(), p0.VnB(), p1.VnB());  // p1 = byte lanes 0, 2, 4 and 6.
__ Ptrue(p2.VnB());
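// Each even/odd register pair below loads the same 256-bit block twice: once
// using the immediate form (which takes a multiple of 32 in the range
// [-256, 224]) and once using an equivalent scaled register offset, so that
// the pair can be checked for equality after running.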
__ Mov(x1, -32);
__ Ld1rob(z0.VnB(), p1.Zeroing(), SVEMemOperand(x0, -32));
__ Ld1rob(z1.VnB(), p1.Zeroing(), SVEMemOperand(x0, x1));
__ Mov(x1, 64 / 2);
__ Ld1roh(z2.VnH(), p2.Zeroing(), SVEMemOperand(x0, 64));
__ Ld1roh(z3.VnH(), p2.Zeroing(), SVEMemOperand(x0, x1, LSL, 1));
__ Mov(x1, -96 / 4);
__ Ld1row(z4.VnS(), p2.Zeroing(), SVEMemOperand(x0, -96));
__ Ld1row(z5.VnS(), p2.Zeroing(), SVEMemOperand(x0, x1, LSL, 2));
__ Mov(x1, 128 / 8);
__ Ld1rod(z6.VnD(), p2.Zeroing(), SVEMemOperand(x0, 128));
__ Ld1rod(z7.VnD(), p2.Zeroing(), SVEMemOperand(x0, x1, LSL, 3));
// Check that all 256-bit segments match by rotating the vector by one
// segment, eoring, and orring across the vector.
__ Dup(z11.VnQ(), z0.VnQ(), 2);
__ Mov(z8, z0);
__ Ext(z8.VnB(), z8.VnB(), z8.VnB(), 32);
__ Eor(z8.VnB(), z8.VnB(), z0.VnB());
__ Orv(b9, p2, z8.VnB());
__ Mov(z8, z2);
__ Ext(z8.VnB(), z8.VnB(), z8.VnB(), 32);
__ Eor(z8.VnB(), z8.VnB(), z2.VnB());
__ Orv(b8, p2, z8.VnB());
__ Orr(z9, z9, z8);
__ Mov(z8, z4);
__ Ext(z8.VnB(), z8.VnB(), z8.VnB(), 32);
__ Eor(z8.VnB(), z8.VnB(), z4.VnB());
__ Orv(b8, p2, z8.VnB());
__ Orr(z9, z9, z8);
__ Mov(z8, z6);
__ Ext(z8.VnB(), z8.VnB(), z8.VnB(), 32);
__ Eor(z8.VnB(), z8.VnB(), z6.VnB());
__ Orv(b8, p2, z8.VnB());
__ Orr(z9, z9, z8);
END();
if (CAN_RUN()) {
RUN();
int vl = core.GetSVELaneCount(kBRegSize) * 8;
if (vl >= 256) {
ASSERT_EQUAL_SVE(z0, z1);
ASSERT_EQUAL_SVE(z2, z3);
ASSERT_EQUAL_SVE(z4, z5);
ASSERT_EQUAL_SVE(z6, z7);
// Check the result of the rotate/eor sequence.
uint64_t expected_z9[] = {0, 0};
ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
}
}
delete[] data;
}
Test* test_sve_ld1ro_list[] =
{Test::MakeSVETest(256, "AARCH64_ASM_sve_ld1ro_vl256", &Test_sve_ld1ro),
Test::MakeSVETest(512, "AARCH64_ASM_sve_ld1ro_vl512", &Test_sve_ld1ro),
Test::MakeSVETest(2048, "AARCH64_ASM_sve_ld1ro_vl2048", &Test_sve_ld1ro)};
#endif
} // namespace aarch64
} // namespace vixl