// Copyright 2019, VIXL authors
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
//   * Redistributions of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//   * Redistributions in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//   * Neither the name of ARM Limited nor the names of its contributors may be
//     used to endorse or promote products derived from this software without
//     specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND
// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <sys/mman.h>
#include <unistd.h>

#include <cfloat>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <functional>

#include "test-runner.h"
#include "test-utils.h"
#include "aarch64/test-utils-aarch64.h"

#include "aarch64/cpu-aarch64.h"
#include "aarch64/disasm-aarch64.h"
#include "aarch64/macro-assembler-aarch64.h"
#include "aarch64/simulator-aarch64.h"
#include "test-assembler-aarch64.h"

namespace vixl {
namespace aarch64 {

Test* MakeSVETest(int vl, const char* name, Test::TestFunctionWithConfig* fn) {
  // We never free this memory, but we need it to live for as long as the
  // static linked list of tests, and this is the easiest way to do it.
  Test* test = new Test(name, fn);
  test->set_sve_vl_in_bits(vl);
  return test;
}

// The TEST_SVE macro works just like the usual TEST macro, but the resulting
// function receives a `Test* config` argument, to allow it to query the
// vector length.
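//
// A test body typically looks like this (an illustrative sketch only; real
// tests appear throughout this file):
//
//    TEST_SVE(sve_example) {
//      SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
//      START();
//      __ Ptrue(p0.VnB());
//      END();
//      if (CAN_RUN()) RUN();
//    }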
#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
// On the Simulator, run SVE tests with several vector lengths, including the
// extreme values and an intermediate value that isn't a power of two.

#define TEST_SVE(name)                                                  \
  void Test##name(Test* config);                                        \
  Test* test_##name##_list[] =                                          \
      {MakeSVETest(128, "AARCH64_ASM_" #name "_vl128", &Test##name),    \
       MakeSVETest(384, "AARCH64_ASM_" #name "_vl384", &Test##name),    \
       MakeSVETest(2048, "AARCH64_ASM_" #name "_vl2048", &Test##name)}; \
  void Test##name(Test* config)

#define SVE_SETUP_WITH_FEATURES(...) \
  SETUP_WITH_FEATURES(__VA_ARGS__);  \
  simulator.SetVectorLengthInBits(config->sve_vl_in_bits())

#else
// Otherwise, just use whatever the hardware provides.
static const int kSVEVectorLengthInBits =
    CPUFeatures::InferFromOS().Has(CPUFeatures::kSVE)
        ? CPU::ReadSVEVectorLengthInBits()
        : 0;

#define TEST_SVE(name)                                                      \
  void Test##name(Test* config);                                            \
  Test* test_##name##_vlauto = MakeSVETest(kSVEVectorLengthInBits,          \
                                           "AARCH64_ASM_" #name "_vlauto",  \
                                           &Test##name);                    \
  void Test##name(Test* config)

#define SVE_SETUP_WITH_FEATURES(...) \
  SETUP_WITH_FEATURES(__VA_ARGS__);  \
  USE(config)

#endif

// Call masm->Insr repeatedly to allow test inputs to be set up concisely. This
// is optimised for call-site clarity, not generated code quality, so it
// doesn't exist in the MacroAssembler itself.
//
// Usage:
//
//    int values[] = { 42, 43, 44 };
//    InsrHelper(&masm, z0.VnS(), values);  // Sets z0.S = { ..., 42, 43, 44 }
//
// The rightmost (highest-indexed) array element maps to the lowest-numbered
// lane.
template <typename T, size_t N>
void InsrHelper(MacroAssembler* masm,
                const ZRegister& zdn,
                const T (&values)[N]) {
  for (size_t i = 0; i < N; i++) {
    masm->Insr(zdn, values[i]);
  }
}

// Conveniently initialise P registers with scalar bit patterns. The
// destination lane size is ignored. This is optimised for call-site clarity,
// not generated code quality.
//
// Usage:
//
//    Initialise(&masm, p0, 0x1234);  // Sets p0 = 0b'0001'0010'0011'0100
void Initialise(MacroAssembler* masm,
                const PRegister& pd,
                uint64_t value3,
                uint64_t value2,
                uint64_t value1,
                uint64_t value0) {
  // Generate a literal pool, as in the array form.
  UseScratchRegisterScope temps(masm);
  Register temp = temps.AcquireX();
  Label data;
  Label done;

  masm->Adr(temp, &data);
  masm->Ldr(pd, SVEMemOperand(temp));
  masm->B(&done);
  {
    ExactAssemblyScope total(masm, kPRegMaxSizeInBytes);
    masm->bind(&data);
    masm->dc64(value0);
    masm->dc64(value1);
    masm->dc64(value2);
    masm->dc64(value3);
  }
  masm->Bind(&done);
}
void Initialise(MacroAssembler* masm,
                const PRegister& pd,
                uint64_t value2,
                uint64_t value1,
                uint64_t value0) {
  Initialise(masm, pd, 0, value2, value1, value0);
}
void Initialise(MacroAssembler* masm,
                const PRegister& pd,
                uint64_t value1,
                uint64_t value0) {
  Initialise(masm, pd, 0, 0, value1, value0);
}
void Initialise(MacroAssembler* masm, const PRegister& pd, uint64_t value0) {
  Initialise(masm, pd, 0, 0, 0, value0);
}

// Conveniently initialise P registers by lane. This is optimised for call-site
// clarity, not generated code quality.
//
// Usage:
//
//    int values[] = { 0x0, 0x1, 0x2 };
//    Initialise(&masm, p0.VnS(), values);  // Sets p0 = 0b'0000'0001'0010
//
// The rightmost (highest-indexed) array element maps to the lowest-numbered
// lane. Unspecified lanes are set to 0 (inactive).
//
// Each element of the `values` array is mapped onto a lane in `pd`. The
// architecture only respects the lowest bit, and writes zero to the upper
// bits, but other (encodable) values can be specified if required by the test.
template <typename T, size_t N>
void Initialise(MacroAssembler* masm,
                const PRegisterWithLaneSize& pd,
                const T (&values)[N]) {
  // Turn the array into 64-bit chunks.
  uint64_t chunks[4] = {0, 0, 0, 0};
  VIXL_STATIC_ASSERT(sizeof(chunks) == kPRegMaxSizeInBytes);

  int p_bits_per_lane = pd.GetLaneSizeInBits() / kZRegBitsPerPRegBit;
  VIXL_ASSERT((64 % p_bits_per_lane) == 0);
  VIXL_ASSERT((N * p_bits_per_lane) <= kPRegMaxSize);

  uint64_t p_lane_mask = GetUintMask(p_bits_per_lane);

  VIXL_STATIC_ASSERT(N <= kPRegMaxSize);
  size_t bit = 0;
  for (int n = static_cast<int>(N - 1); n >= 0; n--) {
    VIXL_ASSERT(bit < (sizeof(chunks) * kBitsPerByte));
    uint64_t value = values[n] & p_lane_mask;
    chunks[bit / 64] |= value << (bit % 64);
    bit += p_bits_per_lane;
  }

  Initialise(masm, pd, chunks[3], chunks[2], chunks[1], chunks[0]);
}
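
// As an illustrative example of the multi-bit packing described above (not
// taken from an existing test): with H-sized lanes, each lane owns two
// predicate bits, so
//
//    int values[] = {0x3, 0x1};
//    Initialise(&masm, p0.VnH(), values);
//
// packs the values as 0b'11'01, i.e. it sets the lowest four predicate bits
// of p0 to 0xd and leaves the remaining bits zero.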

// Ensure that basic test infrastructure works.
TEST_SVE(sve_test_infrastructure_z) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  __ Mov(x0, 0x0123456789abcdef);

  // Test basic `Insr` behaviour.
  __ Insr(z0.VnB(), 1);
  __ Insr(z0.VnB(), 2);
  __ Insr(z0.VnB(), x0);
  __ Insr(z0.VnB(), -42);
  __ Insr(z0.VnB(), 0);

  // Test array inputs.
  int z1_inputs[] = {3, 4, 5, -42, 0};
  InsrHelper(&masm, z1.VnH(), z1_inputs);

  // Test that sign-extension works as intended for various lane sizes.
  __ Dup(z2.VnD(), 0);  // Clear the register first.
  __ Insr(z2.VnB(), -42);                 // 0xd6
  __ Insr(z2.VnB(), 0xfe);                // 0xfe
  __ Insr(z2.VnH(), -42);                 // 0xffd6
  __ Insr(z2.VnH(), 0xfedc);              // 0xfedc
  __ Insr(z2.VnS(), -42);                 // 0xffffffd6
  __ Insr(z2.VnS(), 0xfedcba98);          // 0xfedcba98
  // Use another register for VnD(), so we can support 128-bit Z registers.
  __ Insr(z3.VnD(), -42);                 // 0xffffffffffffffd6
  __ Insr(z3.VnD(), 0xfedcba9876543210);  // 0xfedcba9876543210

  END();

  if (CAN_RUN()) {
    RUN();

    // Test that array checks work properly on a register initialised
    // lane-by-lane.
    int z0_inputs_b[] = {0x01, 0x02, 0xef, 0xd6, 0x00};
    ASSERT_EQUAL_SVE(z0_inputs_b, z0.VnB());

    // Test that lane-by-lane checks work properly on a register initialised
    // by array.
    for (size_t i = 0; i < ArrayLength(z1_inputs); i++) {
      // The rightmost (highest-indexed) array element maps to the
      // lowest-numbered lane.
      int lane = static_cast<int>(ArrayLength(z1_inputs) - i - 1);
      ASSERT_EQUAL_SVE_LANE(z1_inputs[i], z1.VnH(), lane);
    }

    uint64_t z2_inputs_d[] = {0x0000d6feffd6fedc, 0xffffffd6fedcba98};
    ASSERT_EQUAL_SVE(z2_inputs_d, z2.VnD());
    uint64_t z3_inputs_d[] = {0xffffffffffffffd6, 0xfedcba9876543210};
    ASSERT_EQUAL_SVE(z3_inputs_d, z3.VnD());
  }
}

// Ensure that basic test infrastructure works.
TEST_SVE(sve_test_infrastructure_p) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  // Simple cases: move boolean (0 or 1) values.

  int p0_inputs[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0};
  Initialise(&masm, p0.VnB(), p0_inputs);

  int p1_inputs[] = {1, 0, 1, 1, 0, 1, 1, 1};
  Initialise(&masm, p1.VnH(), p1_inputs);

  int p2_inputs[] = {1, 1, 0, 1};
  Initialise(&masm, p2.VnS(), p2_inputs);

  int p3_inputs[] = {0, 1};
  Initialise(&masm, p3.VnD(), p3_inputs);

  // Advanced cases: move numeric value into architecturally-ignored bits.

  // B-sized lanes get one bit in a P register, so there are no ignored bits.

  // H-sized lanes get two bits in a P register.
  int p4_inputs[] = {0x3, 0x2, 0x1, 0x0, 0x1, 0x2, 0x3};
  Initialise(&masm, p4.VnH(), p4_inputs);

  // S-sized lanes get four bits in a P register.
  int p5_inputs[] = {0xc, 0x7, 0x9, 0x6, 0xf};
  Initialise(&masm, p5.VnS(), p5_inputs);

  // D-sized lanes get eight bits in a P register.
  int p6_inputs[] = {0x81, 0xcc, 0x55};
  Initialise(&masm, p6.VnD(), p6_inputs);

  // The largest possible P register has 32 bytes.
  int p7_inputs[] = {0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
                     0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
                     0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
                     0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f};
  Initialise(&masm, p7.VnD(), p7_inputs);

  END();

  if (CAN_RUN()) {
    RUN();

    // Test that lane-by-lane checks work properly. The rightmost
    // (highest-indexed) array element maps to the lowest-numbered lane.
    for (size_t i = 0; i < ArrayLength(p0_inputs); i++) {
      int lane = static_cast<int>(ArrayLength(p0_inputs) - i - 1);
      ASSERT_EQUAL_SVE_LANE(p0_inputs[i], p0.VnB(), lane);
    }
    for (size_t i = 0; i < ArrayLength(p1_inputs); i++) {
      int lane = static_cast<int>(ArrayLength(p1_inputs) - i - 1);
      ASSERT_EQUAL_SVE_LANE(p1_inputs[i], p1.VnH(), lane);
    }
    for (size_t i = 0; i < ArrayLength(p2_inputs); i++) {
      int lane = static_cast<int>(ArrayLength(p2_inputs) - i - 1);
      ASSERT_EQUAL_SVE_LANE(p2_inputs[i], p2.VnS(), lane);
    }
    for (size_t i = 0; i < ArrayLength(p3_inputs); i++) {
      int lane = static_cast<int>(ArrayLength(p3_inputs) - i - 1);
      ASSERT_EQUAL_SVE_LANE(p3_inputs[i], p3.VnD(), lane);
    }

    // Test that array checks work properly on predicates initialised with a
    // possibly-different lane size.
    // 0b...11'10'01'00'01'10'11
    int p4_expected[] = {0x39, 0x1b};
    ASSERT_EQUAL_SVE(p4_expected, p4.VnD());

    ASSERT_EQUAL_SVE(p5_inputs, p5.VnS());

    // 0b...10000001'11001100'01010101
    int p6_expected[] = {2, 0, 0, 1, 3, 0, 3, 0, 1, 1, 1, 1};
    ASSERT_EQUAL_SVE(p6_expected, p6.VnH());

    // 0b...10011100'10011101'10011110'10011111
    int p7_expected[] = {1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1,
                         1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1};
    ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
  }
}

// Test that writes to V registers clear the high bits of the corresponding Z
// register.
TEST_SVE(sve_v_write_clear) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kNEON,
                          CPUFeatures::kFP,
                          CPUFeatures::kSVE);
  START();

  // The Simulator has two mechanisms for writing V registers:
  //  - Write*Register, calling through to SimRegisterBase::Write.
  //  - LogicVRegister::ClearForWrite followed by one or more lane updates.
  // Try to cover both variants.

  // Prepare some known inputs.
  uint8_t data[kQRegSizeInBytes];
  for (size_t i = 0; i < kQRegSizeInBytes; i++) {
    data[i] = 42 + i;
  }
  __ Mov(x10, reinterpret_cast<uintptr_t>(data));
  __ Fmov(d30, 42.0);

  // Use Index to label the lane indices, so failures are easy to detect and
  // diagnose.
  __ Index(z0.VnB(), 0, 1);
  __ Index(z1.VnB(), 0, 1);
  __ Index(z2.VnB(), 0, 1);
  __ Index(z3.VnB(), 0, 1);
  __ Index(z4.VnB(), 0, 1);

  __ Index(z10.VnB(), 0, -1);
  __ Index(z11.VnB(), 0, -1);
  __ Index(z12.VnB(), 0, -1);
  __ Index(z13.VnB(), 0, -1);
  __ Index(z14.VnB(), 0, -1);

  // Instructions using Write*Register (and SimRegisterBase::Write).
  __ Ldr(b0, MemOperand(x10));
  __ Fcvt(h1, d30);
  __ Fmov(s2, 1.5f);
  __ Fmov(d3, d30);
  __ Ldr(q4, MemOperand(x10));

  // Instructions using LogicVRegister::ClearForWrite.
  // These also (incidentally) test that across-lane instructions correctly
  // ignore the high-order Z register lanes.
  __ Sminv(b10, v10.V16B());
  __ Addv(h11, v11.V4H());
  __ Saddlv(s12, v12.V8H());
  __ Dup(v13.V8B(), b13, kDRegSizeInBytes);
  __ Uaddl(v14.V8H(), v14.V8B(), v14.V8B());

  END();

  if (CAN_RUN()) {
    RUN();

    // Check the Q part first.
    ASSERT_EQUAL_128(0x0000000000000000, 0x000000000000002a, v0);
    ASSERT_EQUAL_128(0x0000000000000000, 0x0000000000005140, v1);  // 42.0 (f16)
    ASSERT_EQUAL_128(0x0000000000000000, 0x000000003fc00000, v2);  // 1.5 (f32)
    ASSERT_EQUAL_128(0x0000000000000000, 0x4045000000000000, v3);  // 42.0 (f64)
    ASSERT_EQUAL_128(0x3938373635343332, 0x31302f2e2d2c2b2a, v4);
    ASSERT_EQUAL_128(0x0000000000000000, 0x00000000000000f1, v10);  // -15
    // 0xf9fa + 0xfbfc + 0xfdfe + 0xff00 -> 0xf2f4
    ASSERT_EQUAL_128(0x0000000000000000, 0x000000000000f2f4, v11);
    // 0xfffff1f2 + 0xfffff3f4 + ... + 0xfffffdfe + 0xffffff00 -> 0xffffc6c8
    ASSERT_EQUAL_128(0x0000000000000000, 0x00000000ffffc6c8, v12);
    ASSERT_EQUAL_128(0x0000000000000000, 0xf8f8f8f8f8f8f8f8, v13);  // [-8] x 8
    // [0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, 0x0000]
    //   + [0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, 0x0000]
    //   -> [0x01f2, 0x01f4, 0x01f6, 0x01f8, 0x01fa, 0x01fc, 0x01fe, 0x0000]
    ASSERT_EQUAL_128(0x01f201f401f601f8, 0x01fa01fc01fe0000, v14);

    // Check that the upper lanes are all clear.
    for (int i = kQRegSizeInBytes; i < core.GetSVELaneCount(kBRegSize); i++) {
      ASSERT_EQUAL_SVE_LANE(0x00, z0.VnB(), i);
      ASSERT_EQUAL_SVE_LANE(0x00, z1.VnB(), i);
      ASSERT_EQUAL_SVE_LANE(0x00, z2.VnB(), i);
      ASSERT_EQUAL_SVE_LANE(0x00, z3.VnB(), i);
      ASSERT_EQUAL_SVE_LANE(0x00, z4.VnB(), i);
      ASSERT_EQUAL_SVE_LANE(0x00, z10.VnB(), i);
      ASSERT_EQUAL_SVE_LANE(0x00, z11.VnB(), i);
      ASSERT_EQUAL_SVE_LANE(0x00, z12.VnB(), i);
      ASSERT_EQUAL_SVE_LANE(0x00, z13.VnB(), i);
      ASSERT_EQUAL_SVE_LANE(0x00, z14.VnB(), i);
    }
  }
}

static void MlaMlsHelper(Test* config, unsigned lane_size_in_bits) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  int zd_inputs[] = {0xbb, 0xcc, 0xdd, 0xee};
  int za_inputs[] = {-39, 1, -3, 2};
  int zn_inputs[] = {-5, -20, 9, 8};
  int zm_inputs[] = {9, -5, 4, 5};

  ZRegister zd = z0.WithLaneSize(lane_size_in_bits);
  ZRegister za = z1.WithLaneSize(lane_size_in_bits);
  ZRegister zn = z2.WithLaneSize(lane_size_in_bits);
  ZRegister zm = z3.WithLaneSize(lane_size_in_bits);

  // TODO: Use a simple `Dup` once it accepts arbitrary immediates.
  InsrHelper(&masm, zd, zd_inputs);
  InsrHelper(&masm, za, za_inputs);
  InsrHelper(&masm, zn, zn_inputs);
  InsrHelper(&masm, zm, zm_inputs);

  int p0_inputs[] = {1, 1, 0, 1};
  int p1_inputs[] = {1, 0, 1, 1};
  int p2_inputs[] = {0, 1, 1, 1};
  int p3_inputs[] = {1, 1, 1, 0};

  Initialise(&masm, p0.WithLaneSize(lane_size_in_bits), p0_inputs);
  Initialise(&masm, p1.WithLaneSize(lane_size_in_bits), p1_inputs);
  Initialise(&masm, p2.WithLaneSize(lane_size_in_bits), p2_inputs);
  Initialise(&masm, p3.WithLaneSize(lane_size_in_bits), p3_inputs);

  // The Mla macro automatically selects between mla, mad and movprfx + mla
  // based on what registers are aliased.
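  //
  // Roughly, and only as an illustration (the macro itself is authoritative):
  //
  //    Mla(zd, pg, zd, zn, zm);  // zd aliases za -> mla zd, pg/m, zn, zm
  //    Mla(zd, pg, za, zd, zm);  // zd aliases zn -> mad zd, pg/m, zm, za
  //    Mla(zd, pg, za, zn, zm);  // no aliasing   -> movprfx + mla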
  ZRegister mla_da_result = z10.WithLaneSize(lane_size_in_bits);
  ZRegister mla_dn_result = z11.WithLaneSize(lane_size_in_bits);
  ZRegister mla_dm_result = z12.WithLaneSize(lane_size_in_bits);
  ZRegister mla_d_result = z13.WithLaneSize(lane_size_in_bits);

  __ Mov(mla_da_result, za);
  __ Mla(mla_da_result, p0.Merging(), mla_da_result, zn, zm);

  __ Mov(mla_dn_result, zn);
  __ Mla(mla_dn_result, p1.Merging(), za, mla_dn_result, zm);

  __ Mov(mla_dm_result, zm);
  __ Mla(mla_dm_result, p2.Merging(), za, zn, mla_dm_result);

  __ Mov(mla_d_result, zd);
  __ Mla(mla_d_result, p3.Merging(), za, zn, zm);

  // The Mls macro automatically selects between mls, msb and movprfx + mls
  // based on what registers are aliased.
  ZRegister mls_da_result = z20.WithLaneSize(lane_size_in_bits);
  ZRegister mls_dn_result = z21.WithLaneSize(lane_size_in_bits);
  ZRegister mls_dm_result = z22.WithLaneSize(lane_size_in_bits);
  ZRegister mls_d_result = z23.WithLaneSize(lane_size_in_bits);

  __ Mov(mls_da_result, za);
  __ Mls(mls_da_result, p0.Merging(), mls_da_result, zn, zm);

  __ Mov(mls_dn_result, zn);
  __ Mls(mls_dn_result, p1.Merging(), za, mls_dn_result, zm);

  __ Mov(mls_dm_result, zm);
  __ Mls(mls_dm_result, p2.Merging(), za, zn, mls_dm_result);

  __ Mov(mls_d_result, zd);
  __ Mls(mls_d_result, p3.Merging(), za, zn, zm);

  END();

  if (CAN_RUN()) {
    RUN();

    ASSERT_EQUAL_SVE(za_inputs, z1.WithLaneSize(lane_size_in_bits));
    ASSERT_EQUAL_SVE(zn_inputs, z2.WithLaneSize(lane_size_in_bits));
    ASSERT_EQUAL_SVE(zm_inputs, z3.WithLaneSize(lane_size_in_bits));

    int mla[] = {-84, 101, 33, 42};
    int mls[] = {6, -99, -39, -38};

    int mla_da_expected[] = {mla[0], mla[1], za_inputs[2], mla[3]};
    ASSERT_EQUAL_SVE(mla_da_expected, mla_da_result);

    int mla_dn_expected[] = {mla[0], zn_inputs[1], mla[2], mla[3]};
    ASSERT_EQUAL_SVE(mla_dn_expected, mla_dn_result);

    int mla_dm_expected[] = {zm_inputs[0], mla[1], mla[2], mla[3]};
    ASSERT_EQUAL_SVE(mla_dm_expected, mla_dm_result);

    int mla_d_expected[] = {mla[0], mla[1], mla[2], zd_inputs[3]};
    ASSERT_EQUAL_SVE(mla_d_expected, mla_d_result);

    int mls_da_expected[] = {mls[0], mls[1], za_inputs[2], mls[3]};
    ASSERT_EQUAL_SVE(mls_da_expected, mls_da_result);

    int mls_dn_expected[] = {mls[0], zn_inputs[1], mls[2], mls[3]};
    ASSERT_EQUAL_SVE(mls_dn_expected, mls_dn_result);

    int mls_dm_expected[] = {zm_inputs[0], mls[1], mls[2], mls[3]};
    ASSERT_EQUAL_SVE(mls_dm_expected, mls_dm_result);

    int mls_d_expected[] = {mls[0], mls[1], mls[2], zd_inputs[3]};
    ASSERT_EQUAL_SVE(mls_d_expected, mls_d_result);
  }
}

TEST_SVE(sve_mla_mls_b) { MlaMlsHelper(config, kBRegSize); }
TEST_SVE(sve_mla_mls_h) { MlaMlsHelper(config, kHRegSize); }
TEST_SVE(sve_mla_mls_s) { MlaMlsHelper(config, kSRegSize); }
TEST_SVE(sve_mla_mls_d) { MlaMlsHelper(config, kDRegSize); }

TEST_SVE(sve_bitwise_unpredicate_logical) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  uint64_t z8_inputs[] = {0xfedcba9876543210, 0x0123456789abcdef};
  InsrHelper(&masm, z8.VnD(), z8_inputs);
  uint64_t z15_inputs[] = {0xffffeeeeddddcccc, 0xccccddddeeeeffff};
  InsrHelper(&masm, z15.VnD(), z15_inputs);

  __ And(z1.VnD(), z8.VnD(), z15.VnD());
  __ Bic(z2.VnD(), z8.VnD(), z15.VnD());
  __ Eor(z3.VnD(), z8.VnD(), z15.VnD());
  __ Orr(z4.VnD(), z8.VnD(), z15.VnD());

  END();

  if (CAN_RUN()) {
    RUN();
    uint64_t z1_expected[] = {0xfedcaa8854540000, 0x0000454588aacdef};
    uint64_t z2_expected[] = {0x0000101022003210, 0x0123002201010000};
    uint64_t z3_expected[] = {0x01235476ab89fedc, 0xcdef98ba67453210};
    uint64_t z4_expected[] = {0xfffffefeffddfedc, 0xcdefddffefefffff};

    ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
    ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
    ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
    ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
  }
}

TEST_SVE(sve_last_r) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
  START();

  __ Pfalse(p1.VnB());
  int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
  int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
  Initialise(&masm, p2.VnB(), p2_inputs);
  Initialise(&masm, p3.VnB(), p3_inputs);
  __ Ptrue(p4.VnB());

  __ Index(z0.VnB(), 0x10, 1);
  __ Lasta(x1, p1, z0.VnB());
  __ Lastb(x2, p1, z0.VnB());
  __ Lasta(x3, p2, z0.VnB());
  __ Lastb(x4, p2, z0.VnB());
  __ Lasta(x5, p3, z0.VnB());
  __ Lastb(x6, p3, z0.VnB());
  __ Lasta(x7, p4, z0.VnB());

  __ Punpklo(p3.VnH(), p3.VnB());
  __ Index(z0.VnH(), 0x1110, 1);
  __ Lasta(x9, p1, z0.VnH());
  __ Lastb(x10, p3, z0.VnH());
  __ Lasta(x12, p4, z0.VnH());

  __ Index(z0.VnS(), 0x11111110, 1);
  __ Lastb(x13, p1, z0.VnS());
  __ Lasta(x14, p2, z0.VnS());
  __ Lastb(x18, p4, z0.VnS());

  __ Index(z0.VnD(), 0x1111111111111110, 1);
  __ Lasta(x19, p1, z0.VnD());
  __ Lastb(x20, p3, z0.VnD());
  __ Lasta(x21, p3, z0.VnD());
  END();

  if (CAN_RUN()) {
    RUN();

    ASSERT_EQUAL_64(0x0000000000000010, x1);
    ASSERT_EQUAL_64(0x0000000000000011, x3);
    ASSERT_EQUAL_64(0x0000000000000010, x4);
    ASSERT_EQUAL_64(0x0000000000000019, x5);
    ASSERT_EQUAL_64(0x0000000000000018, x6);
    ASSERT_EQUAL_64(0x0000000000000010, x7);
    ASSERT_EQUAL_64(0x0000000000001110, x9);
    ASSERT_EQUAL_64(0x0000000000001110, x12);
    ASSERT_EQUAL_64(0x0000000011111111, x14);
    ASSERT_EQUAL_64(0x1111111111111110, x19);

    int vl = core.GetSVELaneCount(kBRegSize) * 8;
    switch (vl) {
      case 128:
        ASSERT_EQUAL_64(0x000000000000001f, x2);
        ASSERT_EQUAL_64(0x0000000000001116, x10);
        ASSERT_EQUAL_64(0x0000000011111113, x13);
        ASSERT_EQUAL_64(0x0000000011111113, x18);
        ASSERT_EQUAL_64(0x1111111111111111, x20);
        ASSERT_EQUAL_64(0x1111111111111110, x21);
        break;
      case 384:
        ASSERT_EQUAL_64(0x000000000000003f, x2);
        ASSERT_EQUAL_64(0x0000000000001118, x10);
        ASSERT_EQUAL_64(0x000000001111111b, x13);
        ASSERT_EQUAL_64(0x000000001111111b, x18);
        ASSERT_EQUAL_64(0x1111111111111112, x20);
        ASSERT_EQUAL_64(0x1111111111111113, x21);
        break;
      case 2048:
        ASSERT_EQUAL_64(0x000000000000000f, x2);
        ASSERT_EQUAL_64(0x0000000000001118, x10);
        ASSERT_EQUAL_64(0x000000001111114f, x13);
        ASSERT_EQUAL_64(0x000000001111114f, x18);
        ASSERT_EQUAL_64(0x1111111111111112, x20);
        ASSERT_EQUAL_64(0x1111111111111113, x21);
        break;
      default:
        printf("WARNING: Some tests skipped due to unexpected VL.\n");
        break;
    }
  }
}

TEST_SVE(sve_last_v) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
  START();

  __ Pfalse(p1.VnB());
  int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
  int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
  Initialise(&masm, p2.VnB(), p2_inputs);
  Initialise(&masm, p3.VnB(), p3_inputs);
  __ Ptrue(p4.VnB());

  __ Index(z0.VnB(), 0x10, 1);
  __ Lasta(b1, p1, z0.VnB());
  __ Lastb(b2, p1, z0.VnB());
  __ Lasta(b3, p2, z0.VnB());
  __ Lastb(b4, p2, z0.VnB());
  __ Lasta(b5, p3, z0.VnB());
  __ Lastb(b6, p3, z0.VnB());
  __ Lasta(b7, p4, z0.VnB());

  __ Punpklo(p3.VnH(), p3.VnB());
  __ Index(z0.VnH(), 0x1110, 1);
  __ Lasta(h9, p1, z0.VnH());
  __ Lastb(h10, p3, z0.VnH());
  __ Lasta(h12, p4, z0.VnH());

  __ Index(z0.VnS(), 0x11111110, 1);
  __ Lastb(s13, p1, z0.VnS());
  __ Lasta(s14, p2, z0.VnS());
  __ Lastb(s18, p4, z0.VnS());

  __ Index(z0.VnD(), 0x1111111111111110, 1);
  __ Lasta(d19, p1, z0.VnD());
  __ Lastb(d20, p3, z0.VnD());
  __ Lasta(d21, p3, z0.VnD());
  END();

  if (CAN_RUN()) {
    RUN();

    ASSERT_EQUAL_128(0, 0x0000000000000010, q1);
    ASSERT_EQUAL_128(0, 0x0000000000000011, q3);
    ASSERT_EQUAL_128(0, 0x0000000000000010, q4);
    ASSERT_EQUAL_128(0, 0x0000000000000019, q5);
    ASSERT_EQUAL_128(0, 0x0000000000000018, q6);
    ASSERT_EQUAL_128(0, 0x0000000000000010, q7);
    ASSERT_EQUAL_128(0, 0x0000000000001110, q9);
    ASSERT_EQUAL_128(0, 0x0000000000001110, q12);
    ASSERT_EQUAL_128(0, 0x0000000011111111, q14);
    ASSERT_EQUAL_128(0, 0x1111111111111110, q19);

    int vl = core.GetSVELaneCount(kBRegSize) * 8;
    switch (vl) {
      case 128:
        ASSERT_EQUAL_128(0, 0x000000000000001f, q2);
        ASSERT_EQUAL_128(0, 0x0000000000001116, q10);
        ASSERT_EQUAL_128(0, 0x0000000011111113, q13);
        ASSERT_EQUAL_128(0, 0x0000000011111113, q18);
        ASSERT_EQUAL_128(0, 0x1111111111111111, q20);
        ASSERT_EQUAL_128(0, 0x1111111111111110, q21);
        break;
      case 384:
        ASSERT_EQUAL_128(0, 0x000000000000003f, q2);
        ASSERT_EQUAL_128(0, 0x0000000000001118, q10);
        ASSERT_EQUAL_128(0, 0x000000001111111b, q13);
        ASSERT_EQUAL_128(0, 0x000000001111111b, q18);
        ASSERT_EQUAL_128(0, 0x1111111111111112, q20);
        ASSERT_EQUAL_128(0, 0x1111111111111113, q21);
        break;
      case 2048:
        ASSERT_EQUAL_128(0, 0x000000000000000f, q2);
        ASSERT_EQUAL_128(0, 0x0000000000001118, q10);
        ASSERT_EQUAL_128(0, 0x000000001111114f, q13);
        ASSERT_EQUAL_128(0, 0x000000001111114f, q18);
        ASSERT_EQUAL_128(0, 0x1111111111111112, q20);
        ASSERT_EQUAL_128(0, 0x1111111111111113, q21);
        break;
      default:
        printf("WARNING: Some tests skipped due to unexpected VL.\n");
        break;
    }
  }
}

TEST_SVE(sve_clast_r) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
  START();

  __ Pfalse(p1.VnB());
  int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
  int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
  Initialise(&masm, p2.VnB(), p2_inputs);
  Initialise(&masm, p3.VnB(), p3_inputs);
  __ Ptrue(p4.VnB());

  __ Index(z0.VnB(), 0x10, 1);
  __ Mov(x1, -1);
  __ Mov(x2, -1);
  __ Clasta(x1, p1, x1, z0.VnB());
  __ Clastb(x2, p1, x2, z0.VnB());
  __ Clasta(x3, p2, x3, z0.VnB());
  __ Clastb(x4, p2, x4, z0.VnB());
  __ Clasta(x5, p3, x5, z0.VnB());
  __ Clastb(x6, p3, x6, z0.VnB());
  __ Clasta(x7, p4, x7, z0.VnB());

  __ Punpklo(p3.VnH(), p3.VnB());
  __ Index(z0.VnH(), 0x1110, 1);
  __ Mov(x9, -1);
  __ Clasta(x9, p1, x9, z0.VnH());
  __ Clastb(x10, p3, x10, z0.VnH());
  __ Clasta(x12, p4, x12, z0.VnH());

  __ Index(z0.VnS(), 0x11111110, 1);
  __ Mov(x13, -1);
  __ Clasta(x13, p1, x13, z0.VnS());
  __ Clastb(x14, p2, x14, z0.VnS());
  __ Clasta(x18, p4, x18, z0.VnS());

  __ Index(z0.VnD(), 0x1111111111111110, 1);
  __ Mov(x19, -1);
  __ Clasta(x19, p1, x19, z0.VnD());
  __ Clastb(x20, p2, x20, z0.VnD());
  __ Clasta(x21, p4, x21, z0.VnD());
  END();

  if (CAN_RUN()) {
    RUN();
    ASSERT_EQUAL_64(0x00000000000000ff, x1);
    ASSERT_EQUAL_64(0x00000000000000ff, x2);
    ASSERT_EQUAL_64(0x0000000000000011, x3);
    ASSERT_EQUAL_64(0x0000000000000010, x4);
    ASSERT_EQUAL_64(0x0000000000000019, x5);
    ASSERT_EQUAL_64(0x0000000000000018, x6);
    ASSERT_EQUAL_64(0x0000000000000010, x7);
    ASSERT_EQUAL_64(0x000000000000ffff, x9);
    ASSERT_EQUAL_64(0x0000000000001110, x12);
    ASSERT_EQUAL_64(0x00000000ffffffff, x13);
    ASSERT_EQUAL_64(0x0000000011111110, x14);
    ASSERT_EQUAL_64(0x0000000011111110, x18);
    ASSERT_EQUAL_64(0xffffffffffffffff, x19);
    ASSERT_EQUAL_64(0x1111111111111110, x20);
    ASSERT_EQUAL_64(0x1111111111111110, x21);

    int vl = core.GetSVELaneCount(kBRegSize) * 8;
    switch (vl) {
      case 128:
        ASSERT_EQUAL_64(0x0000000000001116, x10);
        break;
      case 384:
        ASSERT_EQUAL_64(0x0000000000001118, x10);
        break;
      case 2048:
        ASSERT_EQUAL_64(0x0000000000001118, x10);
        break;
      default:
        printf("WARNING: Some tests skipped due to unexpected VL.\n");
        break;
    }
  }
}

TEST_SVE(sve_clast_v) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
  START();

  __ Pfalse(p1.VnB());
  int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
  int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
  Initialise(&masm, p2.VnB(), p2_inputs);
  Initialise(&masm, p3.VnB(), p3_inputs);
  __ Ptrue(p4.VnB());

  __ Index(z0.VnB(), 0x10, 1);
  __ Dup(z1.VnB(), -1);
  __ Dup(z2.VnB(), -1);
  __ Clasta(b1, p1, b1, z0.VnB());
  __ Clastb(b2, p1, b2, z0.VnB());
  __ Clasta(b3, p2, b3, z0.VnB());
  __ Clastb(b4, p2, b4, z0.VnB());
  __ Clasta(b5, p3, b5, z0.VnB());
  __ Clastb(b6, p3, b6, z0.VnB());
  __ Clasta(b7, p4, b7, z0.VnB());

  __ Punpklo(p3.VnH(), p3.VnB());
  __ Index(z0.VnH(), 0x1110, 1);
  __ Dup(z9.VnB(), -1);
  __ Clasta(h9, p1, h9, z0.VnH());
  __ Clastb(h10, p3, h10, z0.VnH());
  __ Clasta(h12, p4, h12, z0.VnH());

  __ Index(z0.VnS(), 0x11111110, 1);
  __ Dup(z13.VnB(), -1);
  __ Clasta(s13, p1, s13, z0.VnS());
  __ Clastb(s14, p2, s14, z0.VnS());
  __ Clasta(s18, p4, s18, z0.VnS());

  __ Index(z0.VnD(), 0x1111111111111110, 1);
  __ Dup(z19.VnB(), -1);
  __ Clasta(d19, p1, d19, z0.VnD());
  __ Clastb(d20, p2, d20, z0.VnD());
  __ Clasta(d21, p4, d21, z0.VnD());
  END();

  if (CAN_RUN()) {
    RUN();
    ASSERT_EQUAL_128(0, 0x00000000000000ff, q1);
    ASSERT_EQUAL_128(0, 0x00000000000000ff, q2);
    ASSERT_EQUAL_128(0, 0x0000000000000011, q3);
    ASSERT_EQUAL_128(0, 0x0000000000000010, q4);
    ASSERT_EQUAL_128(0, 0x0000000000000019, q5);
    ASSERT_EQUAL_128(0, 0x0000000000000018, q6);
    ASSERT_EQUAL_128(0, 0x0000000000000010, q7);
    ASSERT_EQUAL_128(0, 0x000000000000ffff, q9);
    ASSERT_EQUAL_128(0, 0x0000000000001110, q12);
    ASSERT_EQUAL_128(0, 0x00000000ffffffff, q13);
    ASSERT_EQUAL_128(0, 0x0000000011111110, q14);
    ASSERT_EQUAL_128(0, 0x0000000011111110, q18);
    ASSERT_EQUAL_128(0, 0xffffffffffffffff, q19);
    ASSERT_EQUAL_128(0, 0x1111111111111110, q20);
    ASSERT_EQUAL_128(0, 0x1111111111111110, q21);

    int vl = core.GetSVELaneCount(kBRegSize) * 8;
    switch (vl) {
      case 128:
        ASSERT_EQUAL_128(0, 0x0000000000001116, q10);
        break;
      case 384:
        ASSERT_EQUAL_128(0, 0x0000000000001118, q10);
        break;
      case 2048:
        ASSERT_EQUAL_128(0, 0x0000000000001118, q10);
        break;
      default:
        printf("WARNING: Some tests skipped due to unexpected VL.\n");
        break;
    }
  }
}

TEST_SVE(sve_clast_z) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
  START();

  __ Pfalse(p1.VnB());
  int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
  int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
  Initialise(&masm, p2.VnB(), p2_inputs);
  Initialise(&masm, p3.VnB(), p3_inputs);
  __ Ptrue(p4.VnB());

  __ Index(z0.VnB(), 0x10, 1);
  __ Dup(z1.VnB(), 0xff);
  __ Dup(z2.VnB(), 0xff);
  __ Clasta(z1.VnB(), p1, z1.VnB(), z0.VnB());
  __ Clastb(z2.VnB(), p1, z2.VnB(), z0.VnB());
  __ Clasta(z3.VnB(), p2, z3.VnB(), z0.VnB());
  __ Clastb(z4.VnB(), p2, z4.VnB(), z0.VnB());
  __ Clasta(z5.VnB(), p3, z5.VnB(), z0.VnB());
  __ Clastb(z6.VnB(), p3, z6.VnB(), z0.VnB());
  __ Clasta(z7.VnB(), p4, z7.VnB(), z0.VnB());

  __ Punpklo(p3.VnH(), p3.VnB());
  __ Index(z0.VnH(), 0x1110, 1);
  __ Dup(z9.VnB(), 0xff);
  __ Clasta(z9.VnH(), p1, z9.VnH(), z0.VnH());
  __ Clastb(z10.VnH(), p3, z10.VnH(), z0.VnH());
  __ Clasta(z12.VnH(), p4, z12.VnH(), z0.VnH());

  __ Index(z0.VnS(), 0x11111110, 1);
  __ Dup(z13.VnB(), 0xff);
  __ Clasta(z13.VnS(), p1, z13.VnS(), z0.VnS());
  __ Clastb(z14.VnS(), p2, z14.VnS(), z0.VnS());
  __ Clasta(z16.VnS(), p4, z16.VnS(), z0.VnS());

  __ Index(z0.VnD(), 0x1111111111111110, 1);
  __ Dup(z17.VnB(), 0xff);
  __ Clasta(z17.VnD(), p1, z17.VnD(), z0.VnD());
  __ Clastb(z18.VnD(), p2, z18.VnD(), z0.VnD());
  __ Clasta(z20.VnD(), p4, z20.VnD(), z0.VnD());
  END();

  if (CAN_RUN()) {
    RUN();
    uint64_t z1_expected[] = {0xffffffffffffffff, 0xffffffffffffffff};
    uint64_t z2_expected[] = {0xffffffffffffffff, 0xffffffffffffffff};
    uint64_t z3_expected[] = {0x1111111111111111, 0x1111111111111111};
    uint64_t z4_expected[] = {0x1010101010101010, 0x1010101010101010};
    uint64_t z5_expected[] = {0x1919191919191919, 0x1919191919191919};
    uint64_t z6_expected[] = {0x1818181818181818, 0x1818181818181818};
    uint64_t z7_expected[] = {0x1010101010101010, 0x1010101010101010};
    uint64_t z9_expected[] = {0xffffffffffffffff, 0xffffffffffffffff};
    uint64_t z12_expected[] = {0x1110111011101110, 0x1110111011101110};
    uint64_t z13_expected[] = {0xffffffffffffffff, 0xffffffffffffffff};
    uint64_t z14_expected[] = {0x1111111011111110, 0x1111111011111110};
    uint64_t z16_expected[] = {0x1111111011111110, 0x1111111011111110};
    uint64_t z17_expected[] = {0xffffffffffffffff, 0xffffffffffffffff};
    uint64_t z18_expected[] = {0x1111111111111110, 0x1111111111111110};
    uint64_t z20_expected[] = {0x1111111111111110, 0x1111111111111110};

    uint64_t z10_expected_vl128[] = {0x1116111611161116, 0x1116111611161116};
    uint64_t z10_expected_vl_long[] = {0x1118111811181118, 0x1118111811181118};

    ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
    ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
    ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
    ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
    ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
    ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
    ASSERT_EQUAL_SVE(z7_expected, z7.VnD());
    ASSERT_EQUAL_SVE(z9_expected, z9.VnD());
    ASSERT_EQUAL_SVE(z12_expected, z12.VnD());
    ASSERT_EQUAL_SVE(z13_expected, z13.VnD());
    ASSERT_EQUAL_SVE(z14_expected, z14.VnD());
    ASSERT_EQUAL_SVE(z16_expected, z16.VnD());
    ASSERT_EQUAL_SVE(z17_expected, z17.VnD());
    ASSERT_EQUAL_SVE(z18_expected, z18.VnD());
    ASSERT_EQUAL_SVE(z20_expected, z20.VnD());

    int vl = core.GetSVELaneCount(kBRegSize) * 8;
    switch (vl) {
      case 128:
        ASSERT_EQUAL_SVE(z10_expected_vl128, z10.VnD());
        break;
      case 384:
      case 2048:
        ASSERT_EQUAL_SVE(z10_expected_vl_long, z10.VnD());
        break;
      default:
        printf("WARNING: Some tests skipped due to unexpected VL.\n");
        break;
    }
  }
}

TEST_SVE(sve_compact) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
  START();

  __ Ptrue(p0.VnB());
  __ Pfalse(p1.VnB());
  __ Zip1(p2.VnS(), p0.VnS(), p1.VnS());
  __ Zip1(p3.VnS(), p1.VnS(), p0.VnS());
  __ Zip1(p4.VnD(), p0.VnD(), p1.VnD());

  __ Index(z0.VnS(), 0x11111111, 0x11111111);
  __ Mov(q0, q0);
  __ Compact(z1.VnS(), p0, z0.VnS());
  __ Compact(z2.VnS(), p2, z0.VnS());
  __ Compact(z0.VnS(), p3, z0.VnS());

  __ Index(z3.VnD(), 0x1111111111111111, 0x1111111111111111);
  __ Mov(q3, q3);
  __ Compact(z4.VnD(), p0, z3.VnD());
  __ Compact(z5.VnD(), p1, z3.VnD());
  __ Compact(z6.VnD(), p4, z3.VnD());

  END();

  if (CAN_RUN()) {
    RUN();
    uint64_t z1_expected[] = {0x4444444433333333, 0x2222222211111111};
    uint64_t z2_expected[] = {0x0000000000000000, 0x3333333311111111};
    uint64_t z0_expected[] = {0x0000000000000000, 0x4444444422222222};
    uint64_t z4_expected[] = {0x2222222222222222, 0x1111111111111111};
    uint64_t z5_expected[] = {0x0000000000000000, 0x0000000000000000};
    uint64_t z6_expected[] = {0x0000000000000000, 0x1111111111111111};
    ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
    ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
    ASSERT_EQUAL_SVE(z0_expected, z0.VnD());
    ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
    ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
    ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
  }
}

TEST_SVE(sve_splice) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  __ Ptrue(p0.VnB());
  __ Pfalse(p1.VnB());
  int p2b_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
  int p3b_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0};
  int p4b_inputs[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
  int p5b_inputs[] = {0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0};
  int p6b_inputs[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0};
  Initialise(&masm, p2.VnB(), p2b_inputs);
  Initialise(&masm, p3.VnB(), p3b_inputs);
  Initialise(&masm, p4.VnB(), p4b_inputs);
  Initialise(&masm, p5.VnB(), p5b_inputs);
  Initialise(&masm, p6.VnB(), p6b_inputs);

  __ Index(z30.VnB(), 1, 1);

  __ Index(z0.VnB(), -1, -1);
  __ Splice(z0.VnB(), p0, z0.VnB(), z30.VnB());
  __ Index(z1.VnB(), -1, -1);
  __ Splice(z1.VnB(), p1, z1.VnB(), z30.VnB());
  __ Index(z2.VnB(), -1, -1);
  __ Splice(z2.VnB(), p2, z2.VnB(), z30.VnB());
  __ Index(z3.VnB(), -1, -1);
  __ Splice(z3.VnB(), p3, z3.VnB(), z30.VnB());
  __ Index(z4.VnB(), -1, -1);
  __ Splice(z4.VnB(), p4, z4.VnB(), z30.VnB());
  __ Index(z5.VnB(), -1, -1);
  __ Splice(z5.VnB(), p5, z5.VnB(), z30.VnB());
  __ Index(z6.VnB(), -1, -1);
  __ Splice(z6.VnB(), p6, z6.VnB(), z30.VnB());

  int p2h_inputs[] = {0, 0, 0, 0, 0, 0, 1, 0};
  int p3h_inputs[] = {0, 0, 1, 0, 0, 0, 1, 0};
  Initialise(&masm, p2.VnH(), p2h_inputs);
  Initialise(&masm, p3.VnH(), p3h_inputs);

  __ Index(z30.VnH(), 1, 1);
  __ Index(z29.VnH(), -1, -1);
  __ Splice(z7.VnH(), p2, z29.VnH(), z30.VnH());
  __ Splice(z8.VnH(), p3, z29.VnH(), z30.VnH());

  int p2s_inputs[] = {0, 0, 1, 0};
  int p3s_inputs[] = {1, 0, 1, 0};
  Initialise(&masm, p2.VnS(), p2s_inputs);
  Initialise(&masm, p3.VnS(), p3s_inputs);

  __ Index(z30.VnS(), 1, 1);
  __ Index(z29.VnS(), -1, -1);
  __ Splice(z9.VnS(), p2, z29.VnS(), z30.VnS());
  __ Splice(z10.VnS(), p3, z29.VnS(), z30.VnS());

  int p2d_inputs[] = {0, 1};
  int p3d_inputs[] = {1, 0};
  Initialise(&masm, p2.VnD(), p2d_inputs);
  Initialise(&masm, p3.VnD(), p3d_inputs);

  __ Index(z30.VnD(), 1, 1);
  __ Index(z29.VnD(), -1, -1);
  __ Splice(z11.VnD(), p2, z29.VnD(), z30.VnD());
  __ Splice(z30.VnD(), p3, z29.VnD(), z30.VnD());

  END();

  if (CAN_RUN()) {
    RUN();
    uint64_t z0_expected[] = {0xf0f1f2f3f4f5f6f7, 0xf8f9fafbfcfdfeff};
    uint64_t z1_expected[] = {0x100f0e0d0c0b0a09, 0x0807060504030201};
    uint64_t z2_expected[] = {0x0f0e0d0c0b0a0908, 0x07060504030201ff};
    uint64_t z3_expected[] = {0x0f0e0d0c0b0a0908, 0x07060504030201fe};
    uint64_t z4_expected[] = {0x0f0e0d0c0b0a0908, 0x07060504030201f0};
    uint64_t z5_expected[] = {0x0c0b0a0908070605, 0x04030201f6f7f8f9};
    uint64_t z6_expected[] = {0x01f0f1f2f3f4f5f6, 0xf7f8f9fafbfcfdfe};
    uint64_t z7_expected[] = {0x0007000600050004, 0x000300020001fffe};
    uint64_t z8_expected[] = {0x000300020001fffa, 0xfffbfffcfffdfffe};
    uint64_t z9_expected[] = {0x0000000300000002, 0x00000001fffffffe};
    uint64_t z10_expected[] = {0x00000001fffffffc, 0xfffffffdfffffffe};
    uint64_t z11_expected[] = {0x0000000000000001, 0xffffffffffffffff};
    uint64_t z30_expected[] = {0x0000000000000001, 0xfffffffffffffffe};

    ASSERT_EQUAL_SVE(z0_expected, z0.VnD());
    ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
    ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
    ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
    ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
    ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
    ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
    ASSERT_EQUAL_SVE(z7_expected, z7.VnD());
    ASSERT_EQUAL_SVE(z8_expected, z8.VnD());
    ASSERT_EQUAL_SVE(z9_expected, z9.VnD());
    ASSERT_EQUAL_SVE(z10_expected, z10.VnD());
    ASSERT_EQUAL_SVE(z11_expected, z11.VnD());
    ASSERT_EQUAL_SVE(z30_expected, z30.VnD());
  }
}

TEST_SVE(sve_predicate_logical) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  // 0b...01011010'10110111
  int p10_inputs[] = {0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1};  // Pm
  // 0b...11011001'01010010
  int p11_inputs[] = {1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0};  // Pn
  // 0b...01010101'10110010
  int p12_inputs[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0};  // pg

  Initialise(&masm, p10.VnB(), p10_inputs);
  Initialise(&masm, p11.VnB(), p11_inputs);
  Initialise(&masm, p12.VnB(), p12_inputs);

  __ Ands(p0.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
  __ Mrs(x0, NZCV);
  __ Bics(p1.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
  __ Mrs(x1, NZCV);
  __ Eor(p2.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
  __ Nand(p3.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
  __ Nor(p4.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
  __ Orn(p5.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
  __ Orr(p6.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
  __ Sel(p7.VnB(), p12, p11.VnB(), p10.VnB());

  END();

  if (CAN_RUN()) {
    RUN();

    // 0b...01010000'00010010
    int p0_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0};
    // 0b...00000001'00000000
    int p1_expected[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0};
    // 0b...00000001'10100000
    int p2_expected[] = {0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0};
    // 0b...00000101'10100000
    int p3_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0};
    // 0b...00000100'00000000
    int p4_expected[] = {0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
    // 0b...01010101'00010010
    int p5_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0};
    // 0b...01010001'10110010
    int p6_expected[] = {0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0};
    // 0b...01011011'00010111
    int p7_expected[] = {0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1};

    ASSERT_EQUAL_SVE(p0_expected, p0.VnB());
    ASSERT_EQUAL_SVE(p1_expected, p1.VnB());
    ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
    ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
    ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
    ASSERT_EQUAL_SVE(p5_expected, p5.VnB());
    ASSERT_EQUAL_SVE(p6_expected, p6.VnB());
    ASSERT_EQUAL_SVE(p7_expected, p7.VnB());

    ASSERT_EQUAL_32(SVEFirstFlag, w0);
    ASSERT_EQUAL_32(SVENotLastFlag, w1);
  }
}

TEST_SVE(sve_int_compare_vectors) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  int z10_inputs[] = {0x00, 0x80, 0xff, 0x7f, 0x00, 0x00, 0x00, 0xff};
  int z11_inputs[] = {0x00, 0x00, 0x00, 0x00, 0x80, 0xff, 0x7f, 0xfe};
  int p0_inputs[] = {1, 0, 1, 1, 1, 1, 1, 1};
  InsrHelper(&masm, z10.VnB(), z10_inputs);
  InsrHelper(&masm, z11.VnB(), z11_inputs);
  Initialise(&masm, p0.VnB(), p0_inputs);

  __ Cmphs(p6.VnB(), p0.Zeroing(), z10.VnB(), z11.VnB());
  __ Mrs(x6, NZCV);

  uint64_t z12_inputs[] = {0xffffffffffffffff, 0x8000000000000000};
  uint64_t z13_inputs[] = {0x0000000000000000, 0x8000000000000000};
  int p1_inputs[] = {1, 1};
  InsrHelper(&masm, z12.VnD(), z12_inputs);
  InsrHelper(&masm, z13.VnD(), z13_inputs);
  Initialise(&masm, p1.VnD(), p1_inputs);

  __ Cmphi(p7.VnD(), p1.Zeroing(), z12.VnD(), z13.VnD());
  __ Mrs(x7, NZCV);

  int z14_inputs[] = {0, 32767, -1, -32767, 0, 0, 0, 32766};
  int z15_inputs[] = {0, 0, 0, 0, 32767, -1, -32767, 32767};

  int p2_inputs[] = {1, 0, 1, 1, 1, 1, 1, 1};
  InsrHelper(&masm, z14.VnH(), z14_inputs);
  InsrHelper(&masm, z15.VnH(), z15_inputs);
  Initialise(&masm, p2.VnH(), p2_inputs);

  __ Cmpge(p8.VnH(), p2.Zeroing(), z14.VnH(), z15.VnH());
  __ Mrs(x8, NZCV);

  __ Cmpeq(p9.VnH(), p2.Zeroing(), z14.VnH(), z15.VnH());
  __ Mrs(x9, NZCV);

  int z16_inputs[] = {0, -1, 0, 0};
  int z17_inputs[] = {0, 0, 2147483647, -2147483648};
  int p3_inputs[] = {1, 1, 1, 1};
  InsrHelper(&masm, z16.VnS(), z16_inputs);
  InsrHelper(&masm, z17.VnS(), z17_inputs);
  Initialise(&masm, p3.VnS(), p3_inputs);

  __ Cmpgt(p10.VnS(), p3.Zeroing(), z16.VnS(), z17.VnS());
  __ Mrs(x10, NZCV);

  __ Cmpne(p11.VnS(), p3.Zeroing(), z16.VnS(), z17.VnS());
  __ Mrs(x11, NZCV);

  // Architectural aliases testing.
  __ Cmpls(p12.VnB(), p0.Zeroing(), z11.VnB(), z10.VnB());  // HS
  __ Cmplo(p13.VnD(), p1.Zeroing(), z13.VnD(), z12.VnD());  // HI
  __ Cmple(p14.VnH(), p2.Zeroing(), z15.VnH(), z14.VnH());  // GE
  __ Cmplt(p15.VnS(), p3.Zeroing(), z17.VnS(), z16.VnS());  // GT

  END();

  if (CAN_RUN()) {
    RUN();

    int p6_expected[] = {1, 0, 1, 1, 0, 0, 0, 1};
    for (size_t i = 0; i < ArrayLength(p6_expected); i++) {
      int lane = static_cast<int>(ArrayLength(p6_expected) - i - 1);
      ASSERT_EQUAL_SVE_LANE(p6_expected[i], p6.VnB(), lane);
    }

    int p7_expected[] = {1, 0};
    ASSERT_EQUAL_SVE(p7_expected, p7.VnD());

    int p8_expected[] = {1, 0, 0, 0, 0, 1, 1, 0};
    ASSERT_EQUAL_SVE(p8_expected, p8.VnH());

    int p9_expected[] = {1, 0, 0, 0, 0, 0, 0, 0};
    ASSERT_EQUAL_SVE(p9_expected, p9.VnH());

    int p10_expected[] = {0, 0, 0, 1};
    ASSERT_EQUAL_SVE(p10_expected, p10.VnS());

    int p11_expected[] = {0, 1, 1, 1};
    ASSERT_EQUAL_SVE(p11_expected, p11.VnS());

    // Reuse the expected results to verify the architectural aliases.
    ASSERT_EQUAL_SVE(p6_expected, p12.VnB());
    ASSERT_EQUAL_SVE(p7_expected, p13.VnD());
    ASSERT_EQUAL_SVE(p8_expected, p14.VnH());
    ASSERT_EQUAL_SVE(p10_expected, p15.VnS());

    ASSERT_EQUAL_32(SVEFirstFlag, w6);
    ASSERT_EQUAL_32(NoFlag, w7);
    ASSERT_EQUAL_32(NoFlag, w8);
    ASSERT_EQUAL_32(NoFlag, w9);
    ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w10);
  }
}

TEST_SVE(sve_int_compare_vectors_wide_elements) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  int src1_inputs_1[] = {0, 1, -1, -128, 127, 100, -66};
  int src2_inputs_1[] = {0, -1};
  int mask_inputs_1[] = {1, 1, 1, 1, 1, 0, 1};
  InsrHelper(&masm, z13.VnB(), src1_inputs_1);
  InsrHelper(&masm, z19.VnD(), src2_inputs_1);
  Initialise(&masm, p0.VnB(), mask_inputs_1);

  __ Cmpge(p2.VnB(), p0.Zeroing(), z13.VnB(), z19.VnD());
  __ Mrs(x2, NZCV);
  __ Cmpgt(p3.VnB(), p0.Zeroing(), z13.VnB(), z19.VnD());
  __ Mrs(x3, NZCV);

  int src1_inputs_2[] = {0, 32767, -1, -32767, 1, 1234, 0, 32766};
  int src2_inputs_2[] = {0, -32767};
  int mask_inputs_2[] = {1, 0, 1, 1, 1, 1, 1, 1};
  InsrHelper(&masm, z13.VnH(), src1_inputs_2);
  InsrHelper(&masm, z19.VnD(), src2_inputs_2);
  Initialise(&masm, p0.VnH(), mask_inputs_2);

  __ Cmple(p4.VnH(), p0.Zeroing(), z13.VnH(), z19.VnD());
  __ Mrs(x4, NZCV);
  __ Cmplt(p5.VnH(), p0.Zeroing(), z13.VnH(), z19.VnD());
  __ Mrs(x5, NZCV);

  int src1_inputs_3[] = {0, -1, 2147483647, -2147483648};
  int src2_inputs_3[] = {0, -2147483648};
  int mask_inputs_3[] = {1, 1, 1, 1};
  InsrHelper(&masm, z13.VnS(), src1_inputs_3);
  InsrHelper(&masm, z19.VnD(), src2_inputs_3);
  Initialise(&masm, p0.VnS(), mask_inputs_3);

  __ Cmpeq(p6.VnS(), p0.Zeroing(), z13.VnS(), z19.VnD());
  __ Mrs(x6, NZCV);
  __ Cmpne(p7.VnS(), p0.Zeroing(), z13.VnS(), z19.VnD());
  __ Mrs(x7, NZCV);

  int src1_inputs_4[] = {0x00, 0x80, 0x7f, 0xff, 0x7f, 0xf0, 0x0f, 0x55};
  int src2_inputs_4[] = {0x00, 0x7f};
  int mask_inputs_4[] = {1, 1, 1, 1, 0, 1, 1, 1};
  InsrHelper(&masm, z13.VnB(), src1_inputs_4);
  InsrHelper(&masm, z19.VnD(), src2_inputs_4);
  Initialise(&masm, p0.VnB(), mask_inputs_4);

  __ Cmplo(p8.VnB(), p0.Zeroing(), z13.VnB(), z19.VnD());
  __ Mrs(x8, NZCV);
  __ Cmpls(p9.VnB(), p0.Zeroing(), z13.VnB(), z19.VnD());
  __ Mrs(x9, NZCV);

  int src1_inputs_5[] = {0x0000, 0x8000, 0x7fff, 0xffff};
  int src2_inputs_5[] = {0x8000, 0xffff};
  int mask_inputs_5[] = {1, 1, 1, 1};
  InsrHelper(&masm, z13.VnS(), src1_inputs_5);
  InsrHelper(&masm, z19.VnD(), src2_inputs_5);
  Initialise(&masm, p0.VnS(), mask_inputs_5);

  __ Cmphi(p10.VnS(), p0.Zeroing(), z13.VnS(), z19.VnD());
  __ Mrs(x10, NZCV);
  __ Cmphs(p11.VnS(), p0.Zeroing(), z13.VnS(), z19.VnD());
  __ Mrs(x11, NZCV);

  END();

  if (CAN_RUN()) {
    RUN();
    int p2_expected[] = {1, 1, 1, 0, 1, 0, 0};
    ASSERT_EQUAL_SVE(p2_expected, p2.VnB());

    int p3_expected[] = {1, 1, 0, 0, 1, 0, 0};
    ASSERT_EQUAL_SVE(p3_expected, p3.VnB());

    int p4_expected[] = {0x1, 0x0, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0};
    ASSERT_EQUAL_SVE(p4_expected, p4.VnH());

    int p5_expected[] = {0x0, 0x0, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0};
    ASSERT_EQUAL_SVE(p5_expected, p5.VnH());

    int p6_expected[] = {0x1, 0x0, 0x0, 0x1};
    ASSERT_EQUAL_SVE(p6_expected, p6.VnS());

    int p7_expected[] = {0x0, 0x1, 0x1, 0x0};
    ASSERT_EQUAL_SVE(p7_expected, p7.VnS());

    int p8_expected[] = {1, 0, 0, 0, 0, 0, 1, 1};
    ASSERT_EQUAL_SVE(p8_expected, p8.VnB());

    int p9_expected[] = {1, 0, 1, 0, 0, 0, 1, 1};
    ASSERT_EQUAL_SVE(p9_expected, p9.VnB());

    int p10_expected[] = {0x0, 0x0, 0x0, 0x0};
    ASSERT_EQUAL_SVE(p10_expected, p10.VnS());

    int p11_expected[] = {0x0, 0x1, 0x0, 0x1};
    ASSERT_EQUAL_SVE(p11_expected, p11.VnS());

    ASSERT_EQUAL_32(NoFlag, w2);
    ASSERT_EQUAL_32(NoFlag, w3);
    ASSERT_EQUAL_32(NoFlag, w4);
    ASSERT_EQUAL_32(SVENotLastFlag, w5);
    ASSERT_EQUAL_32(SVEFirstFlag, w6);
    ASSERT_EQUAL_32(SVENotLastFlag, w7);
    ASSERT_EQUAL_32(SVEFirstFlag, w8);
    ASSERT_EQUAL_32(SVEFirstFlag, w9);
    ASSERT_EQUAL_32(SVENotLastFlag | SVENoneFlag, w10);
    ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w11);
  }
}
1404
Jacob Bramleye8289202019-07-31 11:25:23 +01001405TEST_SVE(sve_bitwise_imm) {
1406 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
TatWai Chonga1885a52019-04-15 17:19:14 -07001407 START();
1408
1409 // clang-format off
1410 uint64_t z21_inputs[] = {0xfedcba9876543210, 0x0123456789abcdef};
1411 uint32_t z22_inputs[] = {0xfedcba98, 0x76543210, 0x01234567, 0x89abcdef};
1412 uint16_t z23_inputs[] = {0xfedc, 0xba98, 0x7654, 0x3210,
1413 0x0123, 0x4567, 0x89ab, 0xcdef};
1414 uint8_t z24_inputs[] = {0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10,
1415 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef};
1416 // clang-format on
1417
1418 InsrHelper(&masm, z1.VnD(), z21_inputs);
1419 InsrHelper(&masm, z2.VnS(), z22_inputs);
1420 InsrHelper(&masm, z3.VnH(), z23_inputs);
1421 InsrHelper(&masm, z4.VnB(), z24_inputs);
1422
1423 __ And(z1.VnD(), z1.VnD(), 0x0000ffff0000ffff);
1424 __ And(z2.VnS(), z2.VnS(), 0xff0000ff);
1425 __ And(z3.VnH(), z3.VnH(), 0x0ff0);
1426 __ And(z4.VnB(), z4.VnB(), 0x3f);
1427
1428 InsrHelper(&masm, z5.VnD(), z21_inputs);
1429 InsrHelper(&masm, z6.VnS(), z22_inputs);
1430 InsrHelper(&masm, z7.VnH(), z23_inputs);
1431 InsrHelper(&masm, z8.VnB(), z24_inputs);
1432
1433 __ Eor(z5.VnD(), z5.VnD(), 0x0000ffff0000ffff);
1434 __ Eor(z6.VnS(), z6.VnS(), 0xff0000ff);
1435 __ Eor(z7.VnH(), z7.VnH(), 0x0ff0);
1436 __ Eor(z8.VnB(), z8.VnB(), 0x3f);
1437
1438 InsrHelper(&masm, z9.VnD(), z21_inputs);
1439 InsrHelper(&masm, z10.VnS(), z22_inputs);
1440 InsrHelper(&masm, z11.VnH(), z23_inputs);
1441 InsrHelper(&masm, z12.VnB(), z24_inputs);
1442
1443 __ Orr(z9.VnD(), z9.VnD(), 0x0000ffff0000ffff);
1444 __ Orr(z10.VnS(), z10.VnS(), 0xff0000ff);
1445 __ Orr(z11.VnH(), z11.VnH(), 0x0ff0);
1446 __ Orr(z12.VnB(), z12.VnB(), 0x3f);
1447
Jacob Bramley6069fd42019-06-24 10:20:45 +01001448 {
1449 // The `Dup` macro maps onto either `dup` or `dupm`, but has its own test,
1450 // so here we test `dupm` directly.
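    // `dupm` only accepts bitmask immediates, using the same encoding as the
    // logical immediate instructions, so the test values below must be
    // encodable bit patterns.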
1451 ExactAssemblyScope guard(&masm, 4 * kInstructionSize);
1452 __ dupm(z13.VnD(), 0x7ffffff800000000);
1453 __ dupm(z14.VnS(), 0x7ffc7ffc);
1454 __ dupm(z15.VnH(), 0x3ffc);
1455 __ dupm(z16.VnB(), 0xc3);
1456 }
TatWai Chonga1885a52019-04-15 17:19:14 -07001457
1458 END();
1459
1460 if (CAN_RUN()) {
1461 RUN();
1462
1463 // clang-format off
1464 uint64_t z1_expected[] = {0x0000ba9800003210, 0x000045670000cdef};
1465 uint32_t z2_expected[] = {0xfe000098, 0x76000010, 0x01000067, 0x890000ef};
1466 uint16_t z3_expected[] = {0x0ed0, 0x0a90, 0x0650, 0x0210,
1467 0x0120, 0x0560, 0x09a0, 0x0de0};
1468 uint8_t z4_expected[] = {0x3e, 0x1c, 0x3a, 0x18, 0x36, 0x14, 0x32, 0x10,
1469 0x01, 0x23, 0x05, 0x27, 0x09, 0x2b, 0x0d, 0x2f};
1470
1471 ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
1472 ASSERT_EQUAL_SVE(z2_expected, z2.VnS());
1473 ASSERT_EQUAL_SVE(z3_expected, z3.VnH());
1474 ASSERT_EQUAL_SVE(z4_expected, z4.VnB());
1475
1476 uint64_t z5_expected[] = {0xfedc45677654cdef, 0x0123ba9889ab3210};
1477 uint32_t z6_expected[] = {0x01dcba67, 0x895432ef, 0xfe234598, 0x76abcd10};
1478 uint16_t z7_expected[] = {0xf12c, 0xb568, 0x79a4, 0x3de0,
1479 0x0ed3, 0x4a97, 0x865b, 0xc21f};
1480 uint8_t z8_expected[] = {0xc1, 0xe3, 0x85, 0xa7, 0x49, 0x6b, 0x0d, 0x2f,
1481 0x3e, 0x1c, 0x7a, 0x58, 0xb6, 0x94, 0xf2, 0xd0};
1482
1483 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
1484 ASSERT_EQUAL_SVE(z6_expected, z6.VnS());
1485 ASSERT_EQUAL_SVE(z7_expected, z7.VnH());
1486 ASSERT_EQUAL_SVE(z8_expected, z8.VnB());
1487
1488 uint64_t z9_expected[] = {0xfedcffff7654ffff, 0x0123ffff89abffff};
1489 uint32_t z10_expected[] = {0xffdcbaff, 0xff5432ff, 0xff2345ff, 0xffabcdff};
1490 uint16_t z11_expected[] = {0xfffc, 0xbff8, 0x7ff4, 0x3ff0,
1491 0x0ff3, 0x4ff7, 0x8ffb, 0xcfff};
1492 uint8_t z12_expected[] = {0xff, 0xff, 0xbf, 0xbf, 0x7f, 0x7f, 0x3f, 0x3f,
1493 0x3f, 0x3f, 0x7f, 0x7f, 0xbf, 0xbf, 0xff, 0xff};
1494
1495 ASSERT_EQUAL_SVE(z9_expected, z9.VnD());
1496 ASSERT_EQUAL_SVE(z10_expected, z10.VnS());
1497 ASSERT_EQUAL_SVE(z11_expected, z11.VnH());
1498 ASSERT_EQUAL_SVE(z12_expected, z12.VnB());
1499
1500 uint64_t z13_expected[] = {0x7ffffff800000000, 0x7ffffff800000000};
1501 uint32_t z14_expected[] = {0x7ffc7ffc, 0x7ffc7ffc, 0x7ffc7ffc, 0x7ffc7ffc};
1502 uint16_t z15_expected[] = {0x3ffc, 0x3ffc, 0x3ffc, 0x3ffc,
1503                               0x3ffc, 0x3ffc, 0x3ffc, 0x3ffc};
1504 ASSERT_EQUAL_SVE(z13_expected, z13.VnD());
1505 ASSERT_EQUAL_SVE(z14_expected, z14.VnS());
1506 ASSERT_EQUAL_SVE(z15_expected, z15.VnH());
1507 // clang-format on
1508 }
TatWai Chonga1885a52019-04-15 17:19:14 -07001509}
1510
Jacob Bramleye8289202019-07-31 11:25:23 +01001511TEST_SVE(sve_dup_imm) {
Jacob Bramley6069fd42019-06-24 10:20:45 +01001512  // The `Dup` macro can generate either `dup` or `dupm`, and it can also
 1513  // synthesise unencodable immediates.
1514
Jacob Bramleye8289202019-07-31 11:25:23 +01001515 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramley6069fd42019-06-24 10:20:45 +01001516 START();
1517
1518 // Encodable with `dup` (shift 0).
1519 __ Dup(z0.VnD(), -1);
1520 __ Dup(z1.VnS(), 0x7f);
1521 __ Dup(z2.VnH(), -0x80);
1522 __ Dup(z3.VnB(), 42);
1523
1524 // Encodable with `dup` (shift 8).
TatWai Chong6995bfd2019-09-26 10:48:05 +01001525 __ Dup(z4.VnD(), -42 * 256);
1526 __ Dup(z5.VnS(), -0x8000);
1527 __ Dup(z6.VnH(), 0x7f00);
Jacob Bramley6069fd42019-06-24 10:20:45 +01001528 // B-sized lanes cannot take a shift of 8.
1529
1530 // Encodable with `dupm` (but not `dup`).
1531 __ Dup(z10.VnD(), 0x3fc);
1532 __ Dup(z11.VnS(), -516097); // 0xfff81fff, as a signed int.
1533 __ Dup(z12.VnH(), 0x0001);
1534 // All values that fit B-sized lanes are encodable with `dup`.
1535
1536 // Cases that require immediate synthesis.
1537 __ Dup(z20.VnD(), 0x1234);
1538 __ Dup(z21.VnD(), -4242);
1539 __ Dup(z22.VnD(), 0xfedcba9876543210);
1540 __ Dup(z23.VnS(), 0x01020304);
1541 __ Dup(z24.VnS(), -0x01020304);
1542 __ Dup(z25.VnH(), 0x3c38);
1543 // All values that fit B-sized lanes are directly encodable.
1544
1545 END();
1546
1547 if (CAN_RUN()) {
1548 RUN();
1549
1550 ASSERT_EQUAL_SVE(0xffffffffffffffff, z0.VnD());
1551 ASSERT_EQUAL_SVE(0x0000007f, z1.VnS());
1552 ASSERT_EQUAL_SVE(0xff80, z2.VnH());
1553 ASSERT_EQUAL_SVE(0x2a, z3.VnB());
1554
TatWai Chong6995bfd2019-09-26 10:48:05 +01001555 ASSERT_EQUAL_SVE(0xffffffffffffd600, z4.VnD());
1556 ASSERT_EQUAL_SVE(0xffff8000, z5.VnS());
1557 ASSERT_EQUAL_SVE(0x7f00, z6.VnH());
Jacob Bramley6069fd42019-06-24 10:20:45 +01001558
1559 ASSERT_EQUAL_SVE(0x00000000000003fc, z10.VnD());
1560 ASSERT_EQUAL_SVE(0xfff81fff, z11.VnS());
1561 ASSERT_EQUAL_SVE(0x0001, z12.VnH());
1562
1563 ASSERT_EQUAL_SVE(0x1234, z20.VnD());
1564 ASSERT_EQUAL_SVE(0xffffffffffffef6e, z21.VnD());
1565 ASSERT_EQUAL_SVE(0xfedcba9876543210, z22.VnD());
1566 ASSERT_EQUAL_SVE(0x01020304, z23.VnS());
1567 ASSERT_EQUAL_SVE(0xfefdfcfc, z24.VnS());
1568 ASSERT_EQUAL_SVE(0x3c38, z25.VnH());
1569 }
1570}
1571
Jacob Bramleye8289202019-07-31 11:25:23 +01001572TEST_SVE(sve_inc_dec_p_scalar) {
1573 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001574 START();
1575
1576 int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
1577 Initialise(&masm, p0.VnB(), p0_inputs);
1578
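  // A predicate holds one bit per byte of the vector; for H, S and D lanes,
  // only the bit for the lowest-numbered byte of each lane is significant.
  // The pattern above therefore has {9, 5, 3, 2} active {B, H, S, D} lanes.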
1579 int p0_b_count = 9;
1580 int p0_h_count = 5;
1581 int p0_s_count = 3;
1582 int p0_d_count = 2;
1583
1584 // 64-bit operations preserve their high bits.
1585 __ Mov(x0, 0x123456780000002a);
1586 __ Decp(x0, p0.VnB());
1587
1588 __ Mov(x1, 0x123456780000002a);
1589 __ Incp(x1, p0.VnH());
1590
1591 // Check that saturation does not occur.
1592 __ Mov(x10, 1);
1593 __ Decp(x10, p0.VnS());
1594
1595 __ Mov(x11, UINT64_MAX);
1596 __ Incp(x11, p0.VnD());
1597
1598 __ Mov(x12, INT64_MAX);
1599 __ Incp(x12, p0.VnB());
1600
1601 // With an all-true predicate, these instructions increment or decrement by
1602 // the vector length.
Jacob Bramley0ce75842019-07-17 18:12:50 +01001603 __ Ptrue(p15.VnB());
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001604
1605 __ Mov(x20, 0x4000000000000000);
1606 __ Decp(x20, p15.VnB());
1607
1608 __ Mov(x21, 0x4000000000000000);
1609 __ Incp(x21, p15.VnH());
1610
1611 END();
1612 if (CAN_RUN()) {
1613 RUN();
1614
1615 ASSERT_EQUAL_64(0x123456780000002a - p0_b_count, x0);
1616 ASSERT_EQUAL_64(0x123456780000002a + p0_h_count, x1);
1617
1618 ASSERT_EQUAL_64(UINT64_C(1) - p0_s_count, x10);
1619 ASSERT_EQUAL_64(UINT64_MAX + p0_d_count, x11);
1620 ASSERT_EQUAL_64(static_cast<uint64_t>(INT64_MAX) + p0_b_count, x12);
1621
1622 ASSERT_EQUAL_64(0x4000000000000000 - core.GetSVELaneCount(kBRegSize), x20);
1623 ASSERT_EQUAL_64(0x4000000000000000 + core.GetSVELaneCount(kHRegSize), x21);
1624 }
1625}
1626
Jacob Bramleye8289202019-07-31 11:25:23 +01001627TEST_SVE(sve_sqinc_sqdec_p_scalar) {
1628 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001629 START();
1630
1631 int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
1632 Initialise(&masm, p0.VnB(), p0_inputs);
1633
1634 int p0_b_count = 9;
1635 int p0_h_count = 5;
1636 int p0_s_count = 3;
1637 int p0_d_count = 2;
1638
1639 uint64_t dummy_high = 0x1234567800000000;
1640
1641 // 64-bit operations preserve their high bits.
1642 __ Mov(x0, dummy_high + 42);
1643 __ Sqdecp(x0, p0.VnB());
1644
1645 __ Mov(x1, dummy_high + 42);
1646 __ Sqincp(x1, p0.VnH());
1647
1648 // 32-bit operations sign-extend into their high bits.
1649 __ Mov(x2, dummy_high + 42);
1650 __ Sqdecp(x2, p0.VnS(), w2);
1651
1652 __ Mov(x3, dummy_high + 42);
1653 __ Sqincp(x3, p0.VnD(), w3);
1654
1655 __ Mov(x4, dummy_high + 1);
1656 __ Sqdecp(x4, p0.VnS(), w4);
1657
1658 __ Mov(x5, dummy_high - 1);
1659 __ Sqincp(x5, p0.VnD(), w5);
1660
1661 // Check that saturation behaves correctly.
1662 __ Mov(x10, 0x8000000000000001); // INT64_MIN + 1
Martyn Capewell91d5ba32019-11-01 18:11:23 +00001663 __ Sqdecp(x10, p0.VnB());
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001664
1665 __ Mov(x11, dummy_high + 0x80000001); // INT32_MIN + 1
1666 __ Sqdecp(x11, p0.VnH(), w11);
1667
1668 __ Mov(x12, 1);
Martyn Capewell91d5ba32019-11-01 18:11:23 +00001669 __ Sqdecp(x12, p0.VnS());
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001670
1671 __ Mov(x13, dummy_high + 1);
1672 __ Sqdecp(x13, p0.VnD(), w13);
1673
1674 __ Mov(x14, 0x7ffffffffffffffe); // INT64_MAX - 1
Martyn Capewell91d5ba32019-11-01 18:11:23 +00001675 __ Sqincp(x14, p0.VnB());
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001676
1677 __ Mov(x15, dummy_high + 0x7ffffffe); // INT32_MAX - 1
1678 __ Sqincp(x15, p0.VnH(), w15);
1679
1680 // Don't use x16 and x17 since they are scratch registers by default.
1681
1682 __ Mov(x18, 0xffffffffffffffff);
Martyn Capewell91d5ba32019-11-01 18:11:23 +00001683 __ Sqincp(x18, p0.VnS());
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001684
1685 __ Mov(x19, dummy_high + 0xffffffff);
1686 __ Sqincp(x19, p0.VnD(), w19);
1687
1688 __ Mov(x20, dummy_high + 0xffffffff);
1689 __ Sqdecp(x20, p0.VnB(), w20);
1690
1691 // With an all-true predicate, these instructions increment or decrement by
1692 // the vector length.
Jacob Bramley0ce75842019-07-17 18:12:50 +01001693 __ Ptrue(p15.VnB());
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001694
1695 __ Mov(x21, 0);
Martyn Capewell91d5ba32019-11-01 18:11:23 +00001696 __ Sqdecp(x21, p15.VnB());
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001697
1698 __ Mov(x22, 0);
Martyn Capewell91d5ba32019-11-01 18:11:23 +00001699 __ Sqincp(x22, p15.VnH());
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001700
1701 __ Mov(x23, dummy_high);
1702 __ Sqdecp(x23, p15.VnS(), w23);
1703
1704 __ Mov(x24, dummy_high);
1705 __ Sqincp(x24, p15.VnD(), w24);
1706
1707 END();
1708 if (CAN_RUN()) {
1709 RUN();
1710
1711 // 64-bit operations preserve their high bits.
1712 ASSERT_EQUAL_64(dummy_high + 42 - p0_b_count, x0);
1713 ASSERT_EQUAL_64(dummy_high + 42 + p0_h_count, x1);
1714
1715 // 32-bit operations sign-extend into their high bits.
1716 ASSERT_EQUAL_64(42 - p0_s_count, x2);
1717 ASSERT_EQUAL_64(42 + p0_d_count, x3);
1718 ASSERT_EQUAL_64(0xffffffff00000000 | (1 - p0_s_count), x4);
1719 ASSERT_EQUAL_64(p0_d_count - 1, x5);
1720
1721 // Check that saturation behaves correctly.
1722 ASSERT_EQUAL_64(INT64_MIN, x10);
1723 ASSERT_EQUAL_64(INT32_MIN, x11);
1724 ASSERT_EQUAL_64(1 - p0_s_count, x12);
1725 ASSERT_EQUAL_64(1 - p0_d_count, x13);
1726 ASSERT_EQUAL_64(INT64_MAX, x14);
1727 ASSERT_EQUAL_64(INT32_MAX, x15);
1728 ASSERT_EQUAL_64(p0_s_count - 1, x18);
1729 ASSERT_EQUAL_64(p0_d_count - 1, x19);
1730 ASSERT_EQUAL_64(-1 - p0_b_count, x20);
1731
1732 // Check all-true predicates.
1733 ASSERT_EQUAL_64(-core.GetSVELaneCount(kBRegSize), x21);
1734 ASSERT_EQUAL_64(core.GetSVELaneCount(kHRegSize), x22);
1735 ASSERT_EQUAL_64(-core.GetSVELaneCount(kSRegSize), x23);
1736 ASSERT_EQUAL_64(core.GetSVELaneCount(kDRegSize), x24);
1737 }
1738}
1739
Jacob Bramleye8289202019-07-31 11:25:23 +01001740TEST_SVE(sve_uqinc_uqdec_p_scalar) {
1741 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001742 START();
1743
1744 int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
1745 Initialise(&masm, p0.VnB(), p0_inputs);
1746
1747 int p0_b_count = 9;
1748 int p0_h_count = 5;
1749 int p0_s_count = 3;
1750 int p0_d_count = 2;
1751
1752 uint64_t dummy_high = 0x1234567800000000;
1753
1754 // 64-bit operations preserve their high bits.
1755 __ Mov(x0, dummy_high + 42);
1756 __ Uqdecp(x0, p0.VnB());
1757
1758 __ Mov(x1, dummy_high + 42);
1759 __ Uqincp(x1, p0.VnH());
1760
1761 // 32-bit operations zero-extend into their high bits.
1762 __ Mov(x2, dummy_high + 42);
1763 __ Uqdecp(x2, p0.VnS(), w2);
1764
1765 __ Mov(x3, dummy_high + 42);
1766 __ Uqincp(x3, p0.VnD(), w3);
1767
1768 __ Mov(x4, dummy_high + 0x80000001);
1769 __ Uqdecp(x4, p0.VnS(), w4);
1770
1771 __ Mov(x5, dummy_high + 0x7fffffff);
1772 __ Uqincp(x5, p0.VnD(), w5);
1773
1774 // Check that saturation behaves correctly.
1775 __ Mov(x10, 1);
1776 __ Uqdecp(x10, p0.VnB(), x10);
1777
1778 __ Mov(x11, dummy_high + 1);
1779 __ Uqdecp(x11, p0.VnH(), w11);
1780
1781 __ Mov(x12, 0x8000000000000000); // INT64_MAX + 1
1782 __ Uqdecp(x12, p0.VnS(), x12);
1783
1784 __ Mov(x13, dummy_high + 0x80000000); // INT32_MAX + 1
1785 __ Uqdecp(x13, p0.VnD(), w13);
1786
1787 __ Mov(x14, 0xfffffffffffffffe); // UINT64_MAX - 1
1788 __ Uqincp(x14, p0.VnB(), x14);
1789
1790 __ Mov(x15, dummy_high + 0xfffffffe); // UINT32_MAX - 1
1791 __ Uqincp(x15, p0.VnH(), w15);
1792
1793 // Don't use x16 and x17 since they are scratch registers by default.
1794
1795 __ Mov(x18, 0x7ffffffffffffffe); // INT64_MAX - 1
1796 __ Uqincp(x18, p0.VnS(), x18);
1797
1798 __ Mov(x19, dummy_high + 0x7ffffffe); // INT32_MAX - 1
1799 __ Uqincp(x19, p0.VnD(), w19);
1800
1801 // With an all-true predicate, these instructions increment or decrement by
1802 // the vector length.
Jacob Bramley0ce75842019-07-17 18:12:50 +01001803 __ Ptrue(p15.VnB());
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001804
1805 __ Mov(x20, 0x4000000000000000);
1806 __ Uqdecp(x20, p15.VnB(), x20);
1807
1808 __ Mov(x21, 0x4000000000000000);
1809 __ Uqincp(x21, p15.VnH(), x21);
1810
1811 __ Mov(x22, dummy_high + 0x40000000);
1812 __ Uqdecp(x22, p15.VnS(), w22);
1813
1814 __ Mov(x23, dummy_high + 0x40000000);
1815 __ Uqincp(x23, p15.VnD(), w23);
1816
1817 END();
1818 if (CAN_RUN()) {
1819 RUN();
1820
1821 // 64-bit operations preserve their high bits.
1822 ASSERT_EQUAL_64(dummy_high + 42 - p0_b_count, x0);
1823 ASSERT_EQUAL_64(dummy_high + 42 + p0_h_count, x1);
1824
1825 // 32-bit operations zero-extend into their high bits.
1826 ASSERT_EQUAL_64(42 - p0_s_count, x2);
1827 ASSERT_EQUAL_64(42 + p0_d_count, x3);
1828 ASSERT_EQUAL_64(UINT64_C(0x80000001) - p0_s_count, x4);
1829 ASSERT_EQUAL_64(UINT64_C(0x7fffffff) + p0_d_count, x5);
1830
1831 // Check that saturation behaves correctly.
1832 ASSERT_EQUAL_64(0, x10);
1833 ASSERT_EQUAL_64(0, x11);
1834 ASSERT_EQUAL_64(0x8000000000000000 - p0_s_count, x12);
1835 ASSERT_EQUAL_64(UINT64_C(0x80000000) - p0_d_count, x13);
1836 ASSERT_EQUAL_64(UINT64_MAX, x14);
1837 ASSERT_EQUAL_64(UINT32_MAX, x15);
1838 ASSERT_EQUAL_64(0x7ffffffffffffffe + p0_s_count, x18);
1839 ASSERT_EQUAL_64(UINT64_C(0x7ffffffe) + p0_d_count, x19);
1840
1841 // Check all-true predicates.
1842 ASSERT_EQUAL_64(0x4000000000000000 - core.GetSVELaneCount(kBRegSize), x20);
1843 ASSERT_EQUAL_64(0x4000000000000000 + core.GetSVELaneCount(kHRegSize), x21);
1844 ASSERT_EQUAL_64(0x40000000 - core.GetSVELaneCount(kSRegSize), x22);
1845 ASSERT_EQUAL_64(0x40000000 + core.GetSVELaneCount(kDRegSize), x23);
1846 }
1847}
1848
Jacob Bramleye8289202019-07-31 11:25:23 +01001849TEST_SVE(sve_inc_dec_p_vector) {
1850 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001851 START();
1852
1853 // There are {5, 3, 2} active {H, S, D} lanes. B-sized lanes are ignored.
1854 int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
1855 Initialise(&masm, p0.VnB(), p0_inputs);
1856
1857 // Check that saturation does not occur.
1858
1859 int64_t z0_inputs[] = {0x1234567800000042, 0, 1, INT64_MIN};
1860 InsrHelper(&masm, z0.VnD(), z0_inputs);
1861
1862 int64_t z1_inputs[] = {0x12345678ffffff2a, 0, -1, INT64_MAX};
1863 InsrHelper(&masm, z1.VnD(), z1_inputs);
1864
1865 int32_t z2_inputs[] = {0x12340042, 0, -1, 1, INT32_MAX, INT32_MIN};
1866 InsrHelper(&masm, z2.VnS(), z2_inputs);
1867
1868 int16_t z3_inputs[] = {0x122a, 0, 1, -1, INT16_MIN, INT16_MAX};
1869 InsrHelper(&masm, z3.VnH(), z3_inputs);
1870
1871 // The MacroAssembler implements non-destructive operations using movprfx.
1872 __ Decp(z10.VnD(), p0, z0.VnD());
1873 __ Decp(z11.VnD(), p0, z1.VnD());
1874 __ Decp(z12.VnS(), p0, z2.VnS());
1875 __ Decp(z13.VnH(), p0, z3.VnH());
1876
1877 __ Incp(z14.VnD(), p0, z0.VnD());
1878 __ Incp(z15.VnD(), p0, z1.VnD());
1879 __ Incp(z16.VnS(), p0, z2.VnS());
1880 __ Incp(z17.VnH(), p0, z3.VnH());
1881
1882 // Also test destructive forms.
1883 __ Mov(z4, z0);
1884 __ Mov(z5, z1);
1885 __ Mov(z6, z2);
1886 __ Mov(z7, z3);
1887
1888 __ Decp(z0.VnD(), p0);
1889 __ Decp(z1.VnD(), p0);
1890 __ Decp(z2.VnS(), p0);
1891 __ Decp(z3.VnH(), p0);
1892
1893 __ Incp(z4.VnD(), p0);
1894 __ Incp(z5.VnD(), p0);
1895 __ Incp(z6.VnS(), p0);
1896 __ Incp(z7.VnH(), p0);
1897
1898 END();
1899 if (CAN_RUN()) {
1900 RUN();
1901
1902 // z0_inputs[...] - number of active D lanes (2)
1903 int64_t z0_expected[] = {0x1234567800000040, -2, -1, 0x7ffffffffffffffe};
1904 ASSERT_EQUAL_SVE(z0_expected, z0.VnD());
1905
1906 // z1_inputs[...] - number of active D lanes (2)
1907 int64_t z1_expected[] = {0x12345678ffffff28, -2, -3, 0x7ffffffffffffffd};
1908 ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
1909
1910 // z2_inputs[...] - number of active S lanes (3)
1911 int32_t z2_expected[] = {0x1234003f, -3, -4, -2, 0x7ffffffc, 0x7ffffffd};
1912 ASSERT_EQUAL_SVE(z2_expected, z2.VnS());
1913
1914 // z3_inputs[...] - number of active H lanes (5)
1915 int16_t z3_expected[] = {0x1225, -5, -4, -6, 0x7ffb, 0x7ffa};
1916 ASSERT_EQUAL_SVE(z3_expected, z3.VnH());
1917
1918 // z0_inputs[...] + number of active D lanes (2)
1919 uint64_t z4_expected[] = {0x1234567800000044, 2, 3, 0x8000000000000002};
1920 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
1921
1922 // z1_inputs[...] + number of active D lanes (2)
1923 uint64_t z5_expected[] = {0x12345678ffffff2c, 2, 1, 0x8000000000000001};
1924 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
1925
1926 // z2_inputs[...] + number of active S lanes (3)
1927 uint32_t z6_expected[] = {0x12340045, 3, 2, 4, 0x80000002, 0x80000003};
1928 ASSERT_EQUAL_SVE(z6_expected, z6.VnS());
1929
1930 // z3_inputs[...] + number of active H lanes (5)
1931 uint16_t z7_expected[] = {0x122f, 5, 6, 4, 0x8005, 0x8004};
1932 ASSERT_EQUAL_SVE(z7_expected, z7.VnH());
1933
1934 // Check that the non-destructive macros produced the same results.
1935 ASSERT_EQUAL_SVE(z0_expected, z10.VnD());
1936 ASSERT_EQUAL_SVE(z1_expected, z11.VnD());
1937 ASSERT_EQUAL_SVE(z2_expected, z12.VnS());
1938 ASSERT_EQUAL_SVE(z3_expected, z13.VnH());
1939 ASSERT_EQUAL_SVE(z4_expected, z14.VnD());
1940 ASSERT_EQUAL_SVE(z5_expected, z15.VnD());
1941 ASSERT_EQUAL_SVE(z6_expected, z16.VnS());
1942 ASSERT_EQUAL_SVE(z7_expected, z17.VnH());
1943 }
1944}
1945
Jacob Bramleye8289202019-07-31 11:25:23 +01001946TEST_SVE(sve_inc_dec_ptrue_vector) {
1947 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001948 START();
1949
1950 // With an all-true predicate, these instructions increment or decrement by
1951 // the vector length.
Jacob Bramley0ce75842019-07-17 18:12:50 +01001952 __ Ptrue(p15.VnB());
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001953
1954 __ Dup(z0.VnD(), 0);
1955 __ Decp(z0.VnD(), p15);
1956
1957 __ Dup(z1.VnS(), 0);
1958 __ Decp(z1.VnS(), p15);
1959
1960 __ Dup(z2.VnH(), 0);
1961 __ Decp(z2.VnH(), p15);
1962
1963 __ Dup(z3.VnD(), 0);
1964 __ Incp(z3.VnD(), p15);
1965
1966 __ Dup(z4.VnS(), 0);
1967 __ Incp(z4.VnS(), p15);
1968
1969 __ Dup(z5.VnH(), 0);
1970 __ Incp(z5.VnH(), p15);
1971
1972 END();
1973 if (CAN_RUN()) {
1974 RUN();
1975
1976 int d_lane_count = core.GetSVELaneCount(kDRegSize);
1977 int s_lane_count = core.GetSVELaneCount(kSRegSize);
1978 int h_lane_count = core.GetSVELaneCount(kHRegSize);
1979
1980 for (int i = 0; i < d_lane_count; i++) {
1981 ASSERT_EQUAL_SVE_LANE(-d_lane_count, z0.VnD(), i);
1982 ASSERT_EQUAL_SVE_LANE(d_lane_count, z3.VnD(), i);
1983 }
1984
1985 for (int i = 0; i < s_lane_count; i++) {
1986 ASSERT_EQUAL_SVE_LANE(-s_lane_count, z1.VnS(), i);
1987 ASSERT_EQUAL_SVE_LANE(s_lane_count, z4.VnS(), i);
1988 }
1989
1990 for (int i = 0; i < h_lane_count; i++) {
1991 ASSERT_EQUAL_SVE_LANE(-h_lane_count, z2.VnH(), i);
1992 ASSERT_EQUAL_SVE_LANE(h_lane_count, z5.VnH(), i);
1993 }
1994 }
1995}
1996
Jacob Bramleye8289202019-07-31 11:25:23 +01001997TEST_SVE(sve_sqinc_sqdec_p_vector) {
1998 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001999 START();
2000
2001 // There are {5, 3, 2} active {H, S, D} lanes. B-sized lanes are ignored.
2002 int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
2003 Initialise(&masm, p0.VnB(), p0_inputs);
2004
2005 // Check that saturation behaves correctly.
2006
2007 int64_t z0_inputs[] = {0x1234567800000042, 0, 1, INT64_MIN};
2008 InsrHelper(&masm, z0.VnD(), z0_inputs);
2009
2010 int64_t z1_inputs[] = {0x12345678ffffff2a, 0, -1, INT64_MAX};
2011 InsrHelper(&masm, z1.VnD(), z1_inputs);
2012
2013 int32_t z2_inputs[] = {0x12340042, 0, -1, 1, INT32_MAX, INT32_MIN};
2014 InsrHelper(&masm, z2.VnS(), z2_inputs);
2015
2016 int16_t z3_inputs[] = {0x122a, 0, 1, -1, INT16_MIN, INT16_MAX};
2017 InsrHelper(&masm, z3.VnH(), z3_inputs);
2018
2019 // The MacroAssembler implements non-destructive operations using movprfx.
2020 __ Sqdecp(z10.VnD(), p0, z0.VnD());
2021 __ Sqdecp(z11.VnD(), p0, z1.VnD());
2022 __ Sqdecp(z12.VnS(), p0, z2.VnS());
2023 __ Sqdecp(z13.VnH(), p0, z3.VnH());
2024
2025 __ Sqincp(z14.VnD(), p0, z0.VnD());
2026 __ Sqincp(z15.VnD(), p0, z1.VnD());
2027 __ Sqincp(z16.VnS(), p0, z2.VnS());
2028 __ Sqincp(z17.VnH(), p0, z3.VnH());
2029
2030 // Also test destructive forms.
2031 __ Mov(z4, z0);
2032 __ Mov(z5, z1);
2033 __ Mov(z6, z2);
2034 __ Mov(z7, z3);
2035
2036 __ Sqdecp(z0.VnD(), p0);
2037 __ Sqdecp(z1.VnD(), p0);
2038 __ Sqdecp(z2.VnS(), p0);
2039 __ Sqdecp(z3.VnH(), p0);
2040
2041 __ Sqincp(z4.VnD(), p0);
2042 __ Sqincp(z5.VnD(), p0);
2043 __ Sqincp(z6.VnS(), p0);
2044 __ Sqincp(z7.VnH(), p0);
2045
2046 END();
2047 if (CAN_RUN()) {
2048 RUN();
2049
2050 // z0_inputs[...] - number of active D lanes (2)
2051 int64_t z0_expected[] = {0x1234567800000040, -2, -1, INT64_MIN};
2052 ASSERT_EQUAL_SVE(z0_expected, z0.VnD());
2053
2054 // z1_inputs[...] - number of active D lanes (2)
2055 int64_t z1_expected[] = {0x12345678ffffff28, -2, -3, 0x7ffffffffffffffd};
2056 ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
2057
2058 // z2_inputs[...] - number of active S lanes (3)
2059 int32_t z2_expected[] = {0x1234003f, -3, -4, -2, 0x7ffffffc, INT32_MIN};
2060 ASSERT_EQUAL_SVE(z2_expected, z2.VnS());
2061
2062 // z3_inputs[...] - number of active H lanes (5)
2063 int16_t z3_expected[] = {0x1225, -5, -4, -6, INT16_MIN, 0x7ffa};
2064 ASSERT_EQUAL_SVE(z3_expected, z3.VnH());
2065
2066 // z0_inputs[...] + number of active D lanes (2)
2067 uint64_t z4_expected[] = {0x1234567800000044, 2, 3, 0x8000000000000002};
2068 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
2069
2070 // z1_inputs[...] + number of active D lanes (2)
2071 uint64_t z5_expected[] = {0x12345678ffffff2c, 2, 1, INT64_MAX};
2072 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
2073
2074 // z2_inputs[...] + number of active S lanes (3)
2075 uint32_t z6_expected[] = {0x12340045, 3, 2, 4, INT32_MAX, 0x80000003};
2076 ASSERT_EQUAL_SVE(z6_expected, z6.VnS());
2077
2078 // z3_inputs[...] + number of active H lanes (5)
2079 uint16_t z7_expected[] = {0x122f, 5, 6, 4, 0x8005, INT16_MAX};
2080 ASSERT_EQUAL_SVE(z7_expected, z7.VnH());
2081
2082 // Check that the non-destructive macros produced the same results.
2083 ASSERT_EQUAL_SVE(z0_expected, z10.VnD());
2084 ASSERT_EQUAL_SVE(z1_expected, z11.VnD());
2085 ASSERT_EQUAL_SVE(z2_expected, z12.VnS());
2086 ASSERT_EQUAL_SVE(z3_expected, z13.VnH());
2087 ASSERT_EQUAL_SVE(z4_expected, z14.VnD());
2088 ASSERT_EQUAL_SVE(z5_expected, z15.VnD());
2089 ASSERT_EQUAL_SVE(z6_expected, z16.VnS());
2090 ASSERT_EQUAL_SVE(z7_expected, z17.VnH());
2091 }
2092}
2093
Jacob Bramleye8289202019-07-31 11:25:23 +01002094TEST_SVE(sve_sqinc_sqdec_ptrue_vector) {
2095 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01002096 START();
2097
2098 // With an all-true predicate, these instructions increment or decrement by
2099 // the vector length.
Jacob Bramley0ce75842019-07-17 18:12:50 +01002100 __ Ptrue(p15.VnB());
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01002101
2102 __ Dup(z0.VnD(), 0);
2103 __ Sqdecp(z0.VnD(), p15);
2104
2105 __ Dup(z1.VnS(), 0);
2106 __ Sqdecp(z1.VnS(), p15);
2107
2108 __ Dup(z2.VnH(), 0);
2109 __ Sqdecp(z2.VnH(), p15);
2110
2111 __ Dup(z3.VnD(), 0);
2112 __ Sqincp(z3.VnD(), p15);
2113
2114 __ Dup(z4.VnS(), 0);
2115 __ Sqincp(z4.VnS(), p15);
2116
2117 __ Dup(z5.VnH(), 0);
2118 __ Sqincp(z5.VnH(), p15);
2119
2120 END();
2121 if (CAN_RUN()) {
2122 RUN();
2123
2124 int d_lane_count = core.GetSVELaneCount(kDRegSize);
2125 int s_lane_count = core.GetSVELaneCount(kSRegSize);
2126 int h_lane_count = core.GetSVELaneCount(kHRegSize);
2127
2128 for (int i = 0; i < d_lane_count; i++) {
2129 ASSERT_EQUAL_SVE_LANE(-d_lane_count, z0.VnD(), i);
2130 ASSERT_EQUAL_SVE_LANE(d_lane_count, z3.VnD(), i);
2131 }
2132
2133 for (int i = 0; i < s_lane_count; i++) {
2134 ASSERT_EQUAL_SVE_LANE(-s_lane_count, z1.VnS(), i);
2135 ASSERT_EQUAL_SVE_LANE(s_lane_count, z4.VnS(), i);
2136 }
2137
2138 for (int i = 0; i < h_lane_count; i++) {
2139 ASSERT_EQUAL_SVE_LANE(-h_lane_count, z2.VnH(), i);
2140 ASSERT_EQUAL_SVE_LANE(h_lane_count, z5.VnH(), i);
2141 }
2142 }
2143}
2144
Jacob Bramleye8289202019-07-31 11:25:23 +01002145TEST_SVE(sve_uqinc_uqdec_p_vector) {
2146 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01002147 START();
2148
2149 // There are {5, 3, 2} active {H, S, D} lanes. B-sized lanes are ignored.
2150 int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
2151 Initialise(&masm, p0.VnB(), p0_inputs);
2152
2153 // Check that saturation behaves correctly.
2154
2155 uint64_t z0_inputs[] = {0x1234567800000042, 0, 1, 0x8000000000000000};
2156 InsrHelper(&masm, z0.VnD(), z0_inputs);
2157
2158 uint64_t z1_inputs[] = {0x12345678ffffff2a, 0, UINT64_MAX, INT64_MAX};
2159 InsrHelper(&masm, z1.VnD(), z1_inputs);
2160
2161 uint32_t z2_inputs[] = {0x12340042, 0, UINT32_MAX, 1, INT32_MAX, 0x80000000};
2162 InsrHelper(&masm, z2.VnS(), z2_inputs);
2163
2164 uint16_t z3_inputs[] = {0x122a, 0, 1, UINT16_MAX, 0x8000, INT16_MAX};
2165 InsrHelper(&masm, z3.VnH(), z3_inputs);
2166
2167 // The MacroAssembler implements non-destructive operations using movprfx.
2168 __ Uqdecp(z10.VnD(), p0, z0.VnD());
2169 __ Uqdecp(z11.VnD(), p0, z1.VnD());
2170 __ Uqdecp(z12.VnS(), p0, z2.VnS());
2171 __ Uqdecp(z13.VnH(), p0, z3.VnH());
2172
2173 __ Uqincp(z14.VnD(), p0, z0.VnD());
2174 __ Uqincp(z15.VnD(), p0, z1.VnD());
2175 __ Uqincp(z16.VnS(), p0, z2.VnS());
2176 __ Uqincp(z17.VnH(), p0, z3.VnH());
2177
2178 // Also test destructive forms.
2179 __ Mov(z4, z0);
2180 __ Mov(z5, z1);
2181 __ Mov(z6, z2);
2182 __ Mov(z7, z3);
2183
2184 __ Uqdecp(z0.VnD(), p0);
2185 __ Uqdecp(z1.VnD(), p0);
2186 __ Uqdecp(z2.VnS(), p0);
2187 __ Uqdecp(z3.VnH(), p0);
2188
2189 __ Uqincp(z4.VnD(), p0);
2190 __ Uqincp(z5.VnD(), p0);
2191 __ Uqincp(z6.VnS(), p0);
2192 __ Uqincp(z7.VnH(), p0);
2193
2194 END();
2195 if (CAN_RUN()) {
2196 RUN();
2197
2198 // z0_inputs[...] - number of active D lanes (2)
2199 uint64_t z0_expected[] = {0x1234567800000040, 0, 0, 0x7ffffffffffffffe};
2200 ASSERT_EQUAL_SVE(z0_expected, z0.VnD());
2201
2202 // z1_inputs[...] - number of active D lanes (2)
2203 uint64_t z1_expected[] = {0x12345678ffffff28,
2204 0,
2205 0xfffffffffffffffd,
2206 0x7ffffffffffffffd};
2207 ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
2208
2209 // z2_inputs[...] - number of active S lanes (3)
2210 uint32_t z2_expected[] =
2211 {0x1234003f, 0, 0xfffffffc, 0, 0x7ffffffc, 0x7ffffffd};
2212 ASSERT_EQUAL_SVE(z2_expected, z2.VnS());
2213
2214 // z3_inputs[...] - number of active H lanes (5)
2215 uint16_t z3_expected[] = {0x1225, 0, 0, 0xfffa, 0x7ffb, 0x7ffa};
2216 ASSERT_EQUAL_SVE(z3_expected, z3.VnH());
2217
2218 // z0_inputs[...] + number of active D lanes (2)
2219 uint64_t z4_expected[] = {0x1234567800000044, 2, 3, 0x8000000000000002};
2220 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
2221
2222 // z1_inputs[...] + number of active D lanes (2)
2223 uint64_t z5_expected[] = {0x12345678ffffff2c,
2224 2,
2225 UINT64_MAX,
2226 0x8000000000000001};
2227 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
2228
2229 // z2_inputs[...] + number of active S lanes (3)
2230 uint32_t z6_expected[] =
2231 {0x12340045, 3, UINT32_MAX, 4, 0x80000002, 0x80000003};
2232 ASSERT_EQUAL_SVE(z6_expected, z6.VnS());
2233
2234 // z3_inputs[...] + number of active H lanes (5)
2235 uint16_t z7_expected[] = {0x122f, 5, 6, UINT16_MAX, 0x8005, 0x8004};
2236 ASSERT_EQUAL_SVE(z7_expected, z7.VnH());
2237
2238 // Check that the non-destructive macros produced the same results.
2239 ASSERT_EQUAL_SVE(z0_expected, z10.VnD());
2240 ASSERT_EQUAL_SVE(z1_expected, z11.VnD());
2241 ASSERT_EQUAL_SVE(z2_expected, z12.VnS());
2242 ASSERT_EQUAL_SVE(z3_expected, z13.VnH());
2243 ASSERT_EQUAL_SVE(z4_expected, z14.VnD());
2244 ASSERT_EQUAL_SVE(z5_expected, z15.VnD());
2245 ASSERT_EQUAL_SVE(z6_expected, z16.VnS());
2246 ASSERT_EQUAL_SVE(z7_expected, z17.VnH());
2247 }
2248}
2249
Jacob Bramleye8289202019-07-31 11:25:23 +01002250TEST_SVE(sve_uqinc_uqdec_ptrue_vector) {
2251 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01002252 START();
2253
2254 // With an all-true predicate, these instructions increment or decrement by
2255 // the vector length.
Jacob Bramley0ce75842019-07-17 18:12:50 +01002256 __ Ptrue(p15.VnB());
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01002257
2258 __ Mov(x0, 0x1234567800000000);
2259 __ Mov(x1, 0x12340000);
2260 __ Mov(x2, 0x1200);
2261
2262 __ Dup(z0.VnD(), x0);
2263 __ Uqdecp(z0.VnD(), p15);
2264
2265 __ Dup(z1.VnS(), x1);
2266 __ Uqdecp(z1.VnS(), p15);
2267
2268 __ Dup(z2.VnH(), x2);
2269 __ Uqdecp(z2.VnH(), p15);
2270
2271 __ Dup(z3.VnD(), x0);
2272 __ Uqincp(z3.VnD(), p15);
2273
2274 __ Dup(z4.VnS(), x1);
2275 __ Uqincp(z4.VnS(), p15);
2276
2277 __ Dup(z5.VnH(), x2);
2278 __ Uqincp(z5.VnH(), p15);
2279
2280 END();
2281 if (CAN_RUN()) {
2282 RUN();
2283
2284 int d_lane_count = core.GetSVELaneCount(kDRegSize);
2285 int s_lane_count = core.GetSVELaneCount(kSRegSize);
2286 int h_lane_count = core.GetSVELaneCount(kHRegSize);
2287
2288 for (int i = 0; i < d_lane_count; i++) {
2289 ASSERT_EQUAL_SVE_LANE(0x1234567800000000 - d_lane_count, z0.VnD(), i);
2290 ASSERT_EQUAL_SVE_LANE(0x1234567800000000 + d_lane_count, z3.VnD(), i);
2291 }
2292
2293 for (int i = 0; i < s_lane_count; i++) {
2294 ASSERT_EQUAL_SVE_LANE(0x12340000 - s_lane_count, z1.VnS(), i);
2295 ASSERT_EQUAL_SVE_LANE(0x12340000 + s_lane_count, z4.VnS(), i);
2296 }
2297
2298 for (int i = 0; i < h_lane_count; i++) {
2299 ASSERT_EQUAL_SVE_LANE(0x1200 - h_lane_count, z2.VnH(), i);
2300 ASSERT_EQUAL_SVE_LANE(0x1200 + h_lane_count, z5.VnH(), i);
2301 }
2302 }
2303}
2304
Jacob Bramleye8289202019-07-31 11:25:23 +01002305TEST_SVE(sve_index) {
2306 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramleycd8148c2019-07-11 18:43:20 +01002307 START();
2308
2309 // Simple cases.
2310 __ Index(z0.VnB(), 0, 1);
2311 __ Index(z1.VnH(), 1, 1);
2312 __ Index(z2.VnS(), 2, 1);
2313 __ Index(z3.VnD(), 3, 1);
2314
2315 // Synthesised immediates.
2316 __ Index(z4.VnB(), 42, -1);
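  // (The immediate form of `index` only accepts values in the range [-16, 15],
  // so 42 cannot be encoded directly.)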
2317 __ Index(z5.VnH(), -1, 42);
2318 __ Index(z6.VnS(), 42, 42);
2319
2320 // Register arguments.
2321 __ Mov(x0, 42);
2322 __ Mov(x1, -3);
2323 __ Index(z10.VnD(), x0, x1);
2324 __ Index(z11.VnB(), w0, w1);
2325 // The register size should correspond to the lane size, but VIXL allows any
2326 // register at least as big as the lane size.
2327 __ Index(z12.VnB(), x0, x1);
2328 __ Index(z13.VnH(), w0, x1);
2329 __ Index(z14.VnS(), x0, w1);
2330
2331 // Integer overflow.
2332 __ Index(z20.VnB(), UINT8_MAX - 2, 2);
2333 __ Index(z21.VnH(), 7, -3);
2334 __ Index(z22.VnS(), INT32_MAX - 2, 1);
2335 __ Index(z23.VnD(), INT64_MIN + 6, -7);
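  // Lane values wrap at the lane width; for example, z20's B lanes are
  // 0xfd, 0xff, 0x01, 0x03, 0x05, and so on.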
2336
2337 END();
2338
2339 if (CAN_RUN()) {
2340 RUN();
2341
2342 int b_lane_count = core.GetSVELaneCount(kBRegSize);
2343 int h_lane_count = core.GetSVELaneCount(kHRegSize);
2344 int s_lane_count = core.GetSVELaneCount(kSRegSize);
2345 int d_lane_count = core.GetSVELaneCount(kDRegSize);
2346
2347 uint64_t b_mask = GetUintMask(kBRegSize);
2348 uint64_t h_mask = GetUintMask(kHRegSize);
2349 uint64_t s_mask = GetUintMask(kSRegSize);
2350 uint64_t d_mask = GetUintMask(kDRegSize);
2351
2352 // Simple cases.
2353 for (int i = 0; i < b_lane_count; i++) {
2354 ASSERT_EQUAL_SVE_LANE((0 + i) & b_mask, z0.VnB(), i);
2355 }
2356 for (int i = 0; i < h_lane_count; i++) {
2357 ASSERT_EQUAL_SVE_LANE((1 + i) & h_mask, z1.VnH(), i);
2358 }
2359 for (int i = 0; i < s_lane_count; i++) {
2360 ASSERT_EQUAL_SVE_LANE((2 + i) & s_mask, z2.VnS(), i);
2361 }
2362 for (int i = 0; i < d_lane_count; i++) {
2363 ASSERT_EQUAL_SVE_LANE((3 + i) & d_mask, z3.VnD(), i);
2364 }
2365
2366 // Synthesised immediates.
2367 for (int i = 0; i < b_lane_count; i++) {
2368 ASSERT_EQUAL_SVE_LANE((42 - i) & b_mask, z4.VnB(), i);
2369 }
2370 for (int i = 0; i < h_lane_count; i++) {
2371 ASSERT_EQUAL_SVE_LANE((-1 + (42 * i)) & h_mask, z5.VnH(), i);
2372 }
2373 for (int i = 0; i < s_lane_count; i++) {
2374 ASSERT_EQUAL_SVE_LANE((42 + (42 * i)) & s_mask, z6.VnS(), i);
2375 }
2376
2377 // Register arguments.
2378 for (int i = 0; i < d_lane_count; i++) {
2379 ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & d_mask, z10.VnD(), i);
2380 }
2381 for (int i = 0; i < b_lane_count; i++) {
2382 ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & b_mask, z11.VnB(), i);
2383 }
2384 for (int i = 0; i < b_lane_count; i++) {
2385 ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & b_mask, z12.VnB(), i);
2386 }
2387 for (int i = 0; i < h_lane_count; i++) {
2388 ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & h_mask, z13.VnH(), i);
2389 }
2390 for (int i = 0; i < s_lane_count; i++) {
2391 ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & s_mask, z14.VnS(), i);
2392 }
2393
2394 // Integer overflow.
2395 uint8_t expected_z20[] = {0x05, 0x03, 0x01, 0xff, 0xfd};
2396 ASSERT_EQUAL_SVE(expected_z20, z20.VnB());
2397 uint16_t expected_z21[] = {0xfffb, 0xfffe, 0x0001, 0x0004, 0x0007};
2398 ASSERT_EQUAL_SVE(expected_z21, z21.VnH());
2399 uint32_t expected_z22[] = {0x80000000, 0x7fffffff, 0x7ffffffe, 0x7ffffffd};
2400 ASSERT_EQUAL_SVE(expected_z22, z22.VnS());
2401 uint64_t expected_z23[] = {0x7fffffffffffffff, 0x8000000000000006};
2402 ASSERT_EQUAL_SVE(expected_z23, z23.VnD());
2403 }
2404}
2405
TatWai Chongc844bb22019-06-10 15:32:53 -07002406TEST(sve_int_compare_count_and_limit_scalars) {
2407 SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2408 START();
2409
2410 __ Mov(w20, 0xfffffffd);
2411 __ Mov(w21, 0xffffffff);
2412
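  // Whilele activates lane n while (first operand + n) <= (second operand),
  // using a signed comparison; Whilelt uses <, and Whilels/Whilelo are the
  // unsigned equivalents. Here, -3 + n <= -1 holds for n = 0, 1 and 2, so
  // three lanes are active.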
2413 __ Whilele(p0.VnB(), w20, w21);
2414 __ Mrs(x0, NZCV);
2415 __ Whilele(p1.VnH(), w20, w21);
2416 __ Mrs(x1, NZCV);
2417
2418 __ Mov(w20, 0xffffffff);
2419 __ Mov(w21, 0x00000000);
2420
2421 __ Whilelt(p2.VnS(), w20, w21);
2422 __ Mrs(x2, NZCV);
2423 __ Whilelt(p3.VnD(), w20, w21);
2424 __ Mrs(x3, NZCV);
2425
2426 __ Mov(w20, 0xfffffffd);
2427 __ Mov(w21, 0xffffffff);
2428
2429 __ Whilels(p4.VnB(), w20, w21);
2430 __ Mrs(x4, NZCV);
2431 __ Whilels(p5.VnH(), w20, w21);
2432 __ Mrs(x5, NZCV);
2433
2434 __ Mov(w20, 0xffffffff);
2435 __ Mov(w21, 0x00000000);
2436
2437 __ Whilelo(p6.VnS(), w20, w21);
2438 __ Mrs(x6, NZCV);
2439 __ Whilelo(p7.VnD(), w20, w21);
2440 __ Mrs(x7, NZCV);
2441
2442 __ Mov(x20, 0xfffffffffffffffd);
2443 __ Mov(x21, 0xffffffffffffffff);
2444
2445 __ Whilele(p8.VnB(), x20, x21);
2446 __ Mrs(x8, NZCV);
2447 __ Whilele(p9.VnH(), x20, x21);
2448 __ Mrs(x9, NZCV);
2449
2450 __ Mov(x20, 0xffffffffffffffff);
2451 __ Mov(x21, 0x0000000000000000);
2452
2453 __ Whilelt(p10.VnS(), x20, x21);
2454 __ Mrs(x10, NZCV);
2455 __ Whilelt(p11.VnD(), x20, x21);
2456 __ Mrs(x11, NZCV);
2457
2458 __ Mov(x20, 0xfffffffffffffffd);
2459 __ Mov(x21, 0xffffffffffffffff);
2460
2461 __ Whilels(p12.VnB(), x20, x21);
2462 __ Mrs(x12, NZCV);
2463 __ Whilels(p13.VnH(), x20, x21);
2464 __ Mrs(x13, NZCV);
2465
2466 __ Mov(x20, 0xffffffffffffffff);
2467 __ Mov(x21, 0x0000000000000000);
2468
2469 __ Whilelo(p14.VnS(), x20, x21);
2470 __ Mrs(x14, NZCV);
2471 __ Whilelo(p15.VnD(), x20, x21);
2472 __ Mrs(x15, NZCV);
2473
2474 END();
2475
2476 if (CAN_RUN()) {
2477 RUN();
2478
2479 // 0b...00000000'00000111
2480 int p0_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
2481 ASSERT_EQUAL_SVE(p0_expected, p0.VnB());
2482
2483 // 0b...00000000'00010101
2484 int p1_expected[] = {0, 0, 0, 0, 0, 1, 1, 1};
2485 ASSERT_EQUAL_SVE(p1_expected, p1.VnH());
2486
2487 int p2_expected[] = {0x0, 0x0, 0x0, 0x1};
2488 ASSERT_EQUAL_SVE(p2_expected, p2.VnS());
2489
2490 int p3_expected[] = {0x00, 0x01};
2491 ASSERT_EQUAL_SVE(p3_expected, p3.VnD());
2492
2493 // 0b...11111111'11111111
2494 int p4_expected[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
2495 ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
2496
2497 // 0b...01010101'01010101
2498 int p5_expected[] = {1, 1, 1, 1, 1, 1, 1, 1};
2499 ASSERT_EQUAL_SVE(p5_expected, p5.VnH());
2500
2501 int p6_expected[] = {0x0, 0x0, 0x0, 0x0};
2502 ASSERT_EQUAL_SVE(p6_expected, p6.VnS());
2503
2504 int p7_expected[] = {0x00, 0x00};
2505 ASSERT_EQUAL_SVE(p7_expected, p7.VnD());
2506
2507 // 0b...00000000'00000111
2508 int p8_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
2509 ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
2510
2511 // 0b...00000000'00010101
2512 int p9_expected[] = {0, 0, 0, 0, 0, 1, 1, 1};
2513 ASSERT_EQUAL_SVE(p9_expected, p9.VnH());
2514
2515 int p10_expected[] = {0x0, 0x0, 0x0, 0x1};
2516 ASSERT_EQUAL_SVE(p10_expected, p10.VnS());
2517
2518 int p11_expected[] = {0x00, 0x01};
2519 ASSERT_EQUAL_SVE(p11_expected, p11.VnD());
2520
2521 // 0b...11111111'11111111
2522 int p12_expected[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
2523 ASSERT_EQUAL_SVE(p12_expected, p12.VnB());
2524
2525 // 0b...01010101'01010101
2526 int p13_expected[] = {1, 1, 1, 1, 1, 1, 1, 1};
2527 ASSERT_EQUAL_SVE(p13_expected, p13.VnH());
2528
2529 int p14_expected[] = {0x0, 0x0, 0x0, 0x0};
2530 ASSERT_EQUAL_SVE(p14_expected, p14.VnS());
2531
2532 int p15_expected[] = {0x00, 0x00};
2533 ASSERT_EQUAL_SVE(p15_expected, p15.VnD());
2534
2535 ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w0);
2536 ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w1);
2537 ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w2);
2538 ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w3);
2539 ASSERT_EQUAL_32(SVEFirstFlag, w4);
2540 ASSERT_EQUAL_32(SVEFirstFlag, w5);
2541 ASSERT_EQUAL_32(SVENoneFlag | SVENotLastFlag, w6);
2542 ASSERT_EQUAL_32(SVENoneFlag | SVENotLastFlag, w7);
2543 ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w8);
2544 ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w9);
2545 ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w10);
2546 ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w11);
2547 ASSERT_EQUAL_32(SVEFirstFlag, w12);
2548 ASSERT_EQUAL_32(SVEFirstFlag, w13);
2549 ASSERT_EQUAL_32(SVENoneFlag | SVENotLastFlag, w14);
2550 ASSERT_EQUAL_32(SVENoneFlag | SVENotLastFlag, w15);
2551 }
2552}
2553
TatWai Chong302729c2019-06-14 16:18:51 -07002554TEST(sve_int_compare_vectors_signed_imm) {
2555 SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2556 START();
2557
2558 int z13_inputs[] = {0, 1, -1, -15, 126, -127, -126, -15};
2559 int mask_inputs1[] = {1, 1, 1, 0, 1, 1, 1, 1};
2560 InsrHelper(&masm, z13.VnB(), z13_inputs);
2561 Initialise(&masm, p0.VnB(), mask_inputs1);
2562
2563 __ Cmpeq(p2.VnB(), p0.Zeroing(), z13.VnB(), -15);
2564 __ Mrs(x2, NZCV);
2565 __ Cmpeq(p3.VnB(), p0.Zeroing(), z13.VnB(), -127);
2566
2567 int z14_inputs[] = {0, 1, -1, -32767, -32766, 32767, 32766, 0};
2568 int mask_inputs2[] = {1, 1, 1, 0, 1, 1, 1, 1};
2569 InsrHelper(&masm, z14.VnH(), z14_inputs);
2570 Initialise(&masm, p0.VnH(), mask_inputs2);
2571
2572 __ Cmpge(p4.VnH(), p0.Zeroing(), z14.VnH(), -1);
2573 __ Mrs(x4, NZCV);
2574 __ Cmpge(p5.VnH(), p0.Zeroing(), z14.VnH(), -32767);
2575
2576 int z15_inputs[] = {0, 1, -1, INT_MIN};
2577 int mask_inputs3[] = {0, 1, 1, 1};
2578 InsrHelper(&masm, z15.VnS(), z15_inputs);
2579 Initialise(&masm, p0.VnS(), mask_inputs3);
2580
2581 __ Cmpgt(p6.VnS(), p0.Zeroing(), z15.VnS(), 0);
2582 __ Mrs(x6, NZCV);
2583 __ Cmpgt(p7.VnS(), p0.Zeroing(), z15.VnS(), INT_MIN + 1);
2584
2585 __ Cmplt(p8.VnS(), p0.Zeroing(), z15.VnS(), 0);
2586 __ Mrs(x8, NZCV);
2587 __ Cmplt(p9.VnS(), p0.Zeroing(), z15.VnS(), INT_MIN + 1);
2588
2589 int64_t z16_inputs[] = {0, -1};
2590 int mask_inputs4[] = {1, 1};
2591 InsrHelper(&masm, z16.VnD(), z16_inputs);
2592 Initialise(&masm, p0.VnD(), mask_inputs4);
2593
2594 __ Cmple(p10.VnD(), p0.Zeroing(), z16.VnD(), -1);
2595 __ Mrs(x10, NZCV);
2596 __ Cmple(p11.VnD(), p0.Zeroing(), z16.VnD(), LLONG_MIN);
2597
2598 __ Cmpne(p12.VnD(), p0.Zeroing(), z16.VnD(), -1);
2599 __ Mrs(x12, NZCV);
2600 __ Cmpne(p13.VnD(), p0.Zeroing(), z16.VnD(), LLONG_MAX);
2601
2602 END();
2603
2604 if (CAN_RUN()) {
2605 RUN();
2606
2607 int p2_expected[] = {0, 0, 0, 0, 0, 0, 0, 1};
2608 ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
2609
2610 int p3_expected[] = {0, 0, 0, 0, 0, 1, 0, 0};
2611 ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
2612
2613 int p4_expected[] = {0x1, 0x1, 0x1, 0x0, 0x0, 0x1, 0x1, 0x1};
2614 ASSERT_EQUAL_SVE(p4_expected, p4.VnH());
2615
2616 int p5_expected[] = {0x1, 0x1, 0x1, 0x0, 0x1, 0x1, 0x1, 0x1};
2617 ASSERT_EQUAL_SVE(p5_expected, p5.VnH());
2618
2619 int p6_expected[] = {0x0, 0x1, 0x0, 0x0};
2620 ASSERT_EQUAL_SVE(p6_expected, p6.VnS());
2621
2622 int p7_expected[] = {0x0, 0x1, 0x1, 0x0};
2623 ASSERT_EQUAL_SVE(p7_expected, p7.VnS());
2624
2625 int p8_expected[] = {0x0, 0x0, 0x1, 0x1};
2626 ASSERT_EQUAL_SVE(p8_expected, p8.VnS());
2627
2628 int p9_expected[] = {0x0, 0x0, 0x0, 0x1};
2629 ASSERT_EQUAL_SVE(p9_expected, p9.VnS());
2630
2631 int p10_expected[] = {0x00, 0x01};
2632 ASSERT_EQUAL_SVE(p10_expected, p10.VnD());
2633
2634 int p11_expected[] = {0x00, 0x00};
2635 ASSERT_EQUAL_SVE(p11_expected, p11.VnD());
2636
2637 int p12_expected[] = {0x01, 0x00};
2638 ASSERT_EQUAL_SVE(p12_expected, p12.VnD());
2639
2640 int p13_expected[] = {0x01, 0x01};
2641 ASSERT_EQUAL_SVE(p13_expected, p13.VnD());
2642
2643 ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w2);
2644 ASSERT_EQUAL_32(SVEFirstFlag, w4);
2645 ASSERT_EQUAL_32(NoFlag, w6);
2646 ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w8);
2647 ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w10);
2648 ASSERT_EQUAL_32(NoFlag, w12);
2649 }
2650}
2651
2652TEST(sve_int_compare_vectors_unsigned_imm) {
2653 SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2654 START();
2655
2656 uint32_t src1_inputs[] = {0xf7, 0x0f, 0x8f, 0x1f, 0x83, 0x12, 0x00, 0xf1};
2657 int mask_inputs1[] = {1, 1, 1, 0, 1, 1, 0, 1};
2658 InsrHelper(&masm, z13.VnB(), src1_inputs);
2659 Initialise(&masm, p0.VnB(), mask_inputs1);
2660
2661 __ Cmphi(p2.VnB(), p0.Zeroing(), z13.VnB(), 0x0f);
2662 __ Mrs(x2, NZCV);
2663 __ Cmphi(p3.VnB(), p0.Zeroing(), z13.VnB(), 0xf0);
2664
2665 uint32_t src2_inputs[] = {0xffff, 0x8000, 0x1fff, 0x0000, 0x1234};
2666 int mask_inputs2[] = {1, 1, 1, 1, 0};
2667 InsrHelper(&masm, z13.VnH(), src2_inputs);
2668 Initialise(&masm, p0.VnH(), mask_inputs2);
2669
2670 __ Cmphs(p4.VnH(), p0.Zeroing(), z13.VnH(), 0x1f);
2671 __ Mrs(x4, NZCV);
2672 __ Cmphs(p5.VnH(), p0.Zeroing(), z13.VnH(), 0x1fff);
2673
2674 uint32_t src3_inputs[] = {0xffffffff, 0xfedcba98, 0x0000ffff, 0x00000000};
2675 int mask_inputs3[] = {1, 1, 1, 1};
2676 InsrHelper(&masm, z13.VnS(), src3_inputs);
2677 Initialise(&masm, p0.VnS(), mask_inputs3);
2678
2679 __ Cmplo(p6.VnS(), p0.Zeroing(), z13.VnS(), 0x3f);
2680 __ Mrs(x6, NZCV);
2681 __ Cmplo(p7.VnS(), p0.Zeroing(), z13.VnS(), 0x3f3f3f3f);
2682
2683 uint64_t src4_inputs[] = {0xffffffffffffffff, 0x0000000000000000};
2684 int mask_inputs4[] = {1, 1};
2685 InsrHelper(&masm, z13.VnD(), src4_inputs);
2686 Initialise(&masm, p0.VnD(), mask_inputs4);
2687
2688 __ Cmpls(p8.VnD(), p0.Zeroing(), z13.VnD(), 0x2f);
2689 __ Mrs(x8, NZCV);
2690 __ Cmpls(p9.VnD(), p0.Zeroing(), z13.VnD(), 0x800000000000000);
2691
2692 END();
2693
2694 if (CAN_RUN()) {
2695 RUN();
2696
2697 int p2_expected[] = {1, 0, 1, 0, 1, 1, 0, 1};
2698 ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
2699
2700 int p3_expected[] = {1, 0, 0, 0, 0, 0, 0, 1};
2701 ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
2702
2703 int p4_expected[] = {0x1, 0x1, 0x1, 0x0, 0x0};
2704 ASSERT_EQUAL_SVE(p4_expected, p4.VnH());
2705
2706 int p5_expected[] = {0x1, 0x1, 0x1, 0x0, 0x0};
2707 ASSERT_EQUAL_SVE(p5_expected, p5.VnH());
2708
2709 int p6_expected[] = {0x0, 0x0, 0x0, 0x1};
2710 ASSERT_EQUAL_SVE(p6_expected, p6.VnS());
2711
2712 int p7_expected[] = {0x0, 0x0, 0x1, 0x1};
2713 ASSERT_EQUAL_SVE(p7_expected, p7.VnS());
2714
2715 int p8_expected[] = {0x00, 0x01};
2716 ASSERT_EQUAL_SVE(p8_expected, p8.VnD());
2717
2718 int p9_expected[] = {0x00, 0x01};
2719 ASSERT_EQUAL_SVE(p9_expected, p9.VnD());
2720
2721 ASSERT_EQUAL_32(SVEFirstFlag, w2);
2722 ASSERT_EQUAL_32(NoFlag, w4);
2723 ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w6);
2724 ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w8);
2725 }
2726}
2727
TatWai Chongc844bb22019-06-10 15:32:53 -07002728TEST(sve_int_compare_conditionally_terminate_scalars) {
2729 SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2730 START();
2731
2732 __ Mov(x0, 0xfedcba9887654321);
2733 __ Mov(x1, 0x1000100010001000);
2734
Jacob Bramleyb40aa692019-10-07 19:24:29 +01002735 // Initialise Z and C. These are preserved by cterm*, and the V flag is set to
2736 // !C if the condition does not hold.
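  // (When the condition does hold, cterm* sets N (SVEFirstFlag) and clears V.)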
2737 __ Mov(x10, NoFlag);
2738 __ Msr(NZCV, x10);
2739
TatWai Chongc844bb22019-06-10 15:32:53 -07002740 __ Ctermeq(w0, w0);
2741 __ Mrs(x2, NZCV);
2742 __ Ctermeq(x0, x1);
2743 __ Mrs(x3, NZCV);
2744 __ Ctermne(x0, x0);
2745 __ Mrs(x4, NZCV);
2746 __ Ctermne(w0, w1);
2747 __ Mrs(x5, NZCV);
2748
Jacob Bramleyb40aa692019-10-07 19:24:29 +01002749 // As above, but with all flags initially set.
2750 __ Mov(x10, NZCVFlag);
2751 __ Msr(NZCV, x10);
2752
2753 __ Ctermeq(w0, w0);
2754 __ Mrs(x6, NZCV);
2755 __ Ctermeq(x0, x1);
2756 __ Mrs(x7, NZCV);
2757 __ Ctermne(x0, x0);
2758 __ Mrs(x8, NZCV);
2759 __ Ctermne(w0, w1);
2760 __ Mrs(x9, NZCV);
2761
TatWai Chongc844bb22019-06-10 15:32:53 -07002762 END();
2763
2764 if (CAN_RUN()) {
2765 RUN();
2766
2767 ASSERT_EQUAL_32(SVEFirstFlag, w2);
2768 ASSERT_EQUAL_32(VFlag, w3);
2769 ASSERT_EQUAL_32(VFlag, w4);
2770 ASSERT_EQUAL_32(SVEFirstFlag, w5);
Jacob Bramleyb40aa692019-10-07 19:24:29 +01002771
2772 ASSERT_EQUAL_32(SVEFirstFlag | ZCFlag, w6);
2773 ASSERT_EQUAL_32(ZCFlag, w7);
2774 ASSERT_EQUAL_32(ZCFlag, w8);
2775 ASSERT_EQUAL_32(SVEFirstFlag | ZCFlag, w9);
TatWai Chongc844bb22019-06-10 15:32:53 -07002776 }
2777}
2778
Jacob Bramley0ce75842019-07-17 18:12:50 +01002779// Work out what the architectural `PredTest` pseudocode should produce for the
2780// given result and governing predicate.
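// (The architectural PredTest operation reports three conditions: the first
// active lane of the result is set, no active lane of the result is set, and
// the last active lane of the result is not set. These appear here as
// SVEFirstFlag, SVENoneFlag and SVENotLastFlag respectively.)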
2781template <typename Tg, typename Td, int N>
2782static StatusFlags GetPredTestFlags(const Td (&pd)[N],
2783 const Tg (&pg)[N],
2784 int vl) {
2785 int first = -1;
2786 int last = -1;
2787 bool any_active = false;
2788
2789 // Only consider potentially-active lanes.
2790 int start = (N > vl) ? (N - vl) : 0;
2791 for (int i = start; i < N; i++) {
2792 if ((pg[i] & 1) == 1) {
2793 // Look for the first and last active lanes.
2794      // Note that the 'first' lane (lane 0) is the one with the highest index, since these arrays list the highest-numbered lane first.
2795 if (last < 0) last = i;
2796 first = i;
2797 // Look for any active lanes that are also active in pd.
2798 if ((pd[i] & 1) == 1) any_active = true;
2799 }
2800 }
2801
2802 uint32_t flags = 0;
2803 if ((first >= 0) && ((pd[first] & 1) == 1)) flags |= SVEFirstFlag;
2804 if (!any_active) flags |= SVENoneFlag;
2805 if ((last < 0) || ((pd[last] & 1) == 0)) flags |= SVENotLastFlag;
2806 return static_cast<StatusFlags>(flags);
2807}
2808
2809typedef void (MacroAssembler::*PfirstPnextFn)(const PRegisterWithLaneSize& pd,
2810 const PRegister& pg,
2811 const PRegisterWithLaneSize& pn);
2812template <typename Tg, typename Tn, typename Td>
Jacob Bramleye8289202019-07-31 11:25:23 +01002813static void PfirstPnextHelper(Test* config,
2814 PfirstPnextFn macro,
Jacob Bramley0ce75842019-07-17 18:12:50 +01002815 unsigned lane_size_in_bits,
2816 const Tg& pg_inputs,
2817 const Tn& pn_inputs,
2818 const Td& pd_expected) {
Jacob Bramleye8289202019-07-31 11:25:23 +01002819 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramley0ce75842019-07-17 18:12:50 +01002820 START();
2821
2822 PRegister pg = p15;
2823 PRegister pn = p14;
2824 Initialise(&masm, pg.WithLaneSize(lane_size_in_bits), pg_inputs);
2825 Initialise(&masm, pn.WithLaneSize(lane_size_in_bits), pn_inputs);
2826
2827 // Initialise NZCV to an impossible value, to check that we actually write it.
2828 __ Mov(x10, NZCVFlag);
2829
2830 // If pd.Is(pn), the MacroAssembler simply passes the arguments directly to
2831 // the Assembler.
2832 __ Msr(NZCV, x10);
2833 __ Mov(p0, pn);
2834 (masm.*macro)(p0.WithLaneSize(lane_size_in_bits),
2835 pg,
2836 p0.WithLaneSize(lane_size_in_bits));
2837 __ Mrs(x0, NZCV);
2838
2839 // The MacroAssembler supports non-destructive use.
2840 __ Msr(NZCV, x10);
2841 (masm.*macro)(p1.WithLaneSize(lane_size_in_bits),
2842 pg,
2843 pn.WithLaneSize(lane_size_in_bits));
2844 __ Mrs(x1, NZCV);
2845
2846  // If pd.Aliases(pg), the macro requires a scratch register.
2847 {
2848 UseScratchRegisterScope temps(&masm);
2849 temps.Include(p13);
2850 __ Msr(NZCV, x10);
2851 __ Mov(p2, p15);
2852 (masm.*macro)(p2.WithLaneSize(lane_size_in_bits),
2853 p2,
2854 pn.WithLaneSize(lane_size_in_bits));
2855 __ Mrs(x2, NZCV);
2856 }
2857
2858 END();
2859
2860 if (CAN_RUN()) {
2861 RUN();
2862
2863 // Check that the inputs weren't modified.
2864 ASSERT_EQUAL_SVE(pn_inputs, pn.WithLaneSize(lane_size_in_bits));
2865 ASSERT_EQUAL_SVE(pg_inputs, pg.WithLaneSize(lane_size_in_bits));
2866
2867 // Check the primary operation.
2868 ASSERT_EQUAL_SVE(pd_expected, p0.WithLaneSize(lane_size_in_bits));
2869 ASSERT_EQUAL_SVE(pd_expected, p1.WithLaneSize(lane_size_in_bits));
2870 ASSERT_EQUAL_SVE(pd_expected, p2.WithLaneSize(lane_size_in_bits));
2871
2872 // Check that the flags were properly set.
2873 StatusFlags nzcv_expected =
2874 GetPredTestFlags(pd_expected,
2875 pg_inputs,
2876 core.GetSVELaneCount(kBRegSize));
2877 ASSERT_EQUAL_64(nzcv_expected, x0);
2878 ASSERT_EQUAL_64(nzcv_expected, x1);
2879 ASSERT_EQUAL_64(nzcv_expected, x2);
2880 }
2881}
2882
2883template <typename Tg, typename Tn, typename Td>
Jacob Bramleye8289202019-07-31 11:25:23 +01002884static void PfirstHelper(Test* config,
2885 const Tg& pg_inputs,
Jacob Bramley0ce75842019-07-17 18:12:50 +01002886 const Tn& pn_inputs,
2887 const Td& pd_expected) {
Jacob Bramleye8289202019-07-31 11:25:23 +01002888 PfirstPnextHelper(config,
2889 &MacroAssembler::Pfirst,
Jacob Bramley0ce75842019-07-17 18:12:50 +01002890 kBRegSize, // pfirst only accepts B-sized lanes.
2891 pg_inputs,
2892 pn_inputs,
2893 pd_expected);
2894}
2895
2896template <typename Tg, typename Tn, typename Td>
Jacob Bramleye8289202019-07-31 11:25:23 +01002897static void PnextHelper(Test* config,
2898 unsigned lane_size_in_bits,
Jacob Bramley0ce75842019-07-17 18:12:50 +01002899 const Tg& pg_inputs,
2900 const Tn& pn_inputs,
2901 const Td& pd_expected) {
Jacob Bramleye8289202019-07-31 11:25:23 +01002902 PfirstPnextHelper(config,
2903 &MacroAssembler::Pnext,
Jacob Bramley0ce75842019-07-17 18:12:50 +01002904 lane_size_in_bits,
2905 pg_inputs,
2906 pn_inputs,
2907 pd_expected);
2908}
2909
Jacob Bramleye8289202019-07-31 11:25:23 +01002910TEST_SVE(sve_pfirst) {
Jacob Bramley0ce75842019-07-17 18:12:50 +01002911 // Provide more lanes than kPRegMinSize (to check propagation if we have a
2912 // large VL), but few enough to make the test easy to read.
2913 int in0[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
2914 int in1[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
2915 int in2[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
2916 int in3[] = {0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1};
2917 int in4[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
2918 VIXL_ASSERT(ArrayLength(in0) > kPRegMinSize);
2919
2920 // Pfirst finds the first active lane in pg, and activates the corresponding
2921 // lane in pn (if it isn't already active).
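  // All other lanes of pn are left unchanged.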
2922
2923 // The first active lane in in1 is here. |
2924 // v
2925 int exp10[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
2926 int exp12[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0};
2927 int exp13[] = {0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1};
2928 int exp14[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
Jacob Bramleye8289202019-07-31 11:25:23 +01002929 PfirstHelper(config, in1, in0, exp10);
2930 PfirstHelper(config, in1, in2, exp12);
2931 PfirstHelper(config, in1, in3, exp13);
2932 PfirstHelper(config, in1, in4, exp14);
Jacob Bramley0ce75842019-07-17 18:12:50 +01002933
2934 // The first active lane in in2 is here. |
2935 // v
2936 int exp20[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0};
2937 int exp21[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0};
2938 int exp23[] = {0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1};
2939 int exp24[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0};
Jacob Bramleye8289202019-07-31 11:25:23 +01002940 PfirstHelper(config, in2, in0, exp20);
2941 PfirstHelper(config, in2, in1, exp21);
2942 PfirstHelper(config, in2, in3, exp23);
2943 PfirstHelper(config, in2, in4, exp24);
Jacob Bramley0ce75842019-07-17 18:12:50 +01002944
2945 // The first active lane in in3 is here. |
2946 // v
2947 int exp30[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
2948 int exp31[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1};
2949 int exp32[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1};
2950 int exp34[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
Jacob Bramleye8289202019-07-31 11:25:23 +01002951 PfirstHelper(config, in3, in0, exp30);
2952 PfirstHelper(config, in3, in1, exp31);
2953 PfirstHelper(config, in3, in2, exp32);
2954 PfirstHelper(config, in3, in4, exp34);
Jacob Bramley0ce75842019-07-17 18:12:50 +01002955
2956 // | The first active lane in in4 is here.
2957 // v
2958 int exp40[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
2959 int exp41[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
2960 int exp42[] = {1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
2961 int exp43[] = {1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1};
Jacob Bramleye8289202019-07-31 11:25:23 +01002962 PfirstHelper(config, in4, in0, exp40);
2963 PfirstHelper(config, in4, in1, exp41);
2964 PfirstHelper(config, in4, in2, exp42);
2965 PfirstHelper(config, in4, in3, exp43);
Jacob Bramley0ce75842019-07-17 18:12:50 +01002966
2967 // If pg is all inactive, the input is passed through unchanged.
Jacob Bramleye8289202019-07-31 11:25:23 +01002968 PfirstHelper(config, in0, in0, in0);
2969 PfirstHelper(config, in0, in1, in1);
2970 PfirstHelper(config, in0, in2, in2);
2971 PfirstHelper(config, in0, in3, in3);
Jacob Bramley0ce75842019-07-17 18:12:50 +01002972
2973 // If the values of pg and pn match, the value is passed through unchanged.
Jacob Bramleye8289202019-07-31 11:25:23 +01002974 PfirstHelper(config, in0, in0, in0);
2975 PfirstHelper(config, in1, in1, in1);
2976 PfirstHelper(config, in2, in2, in2);
2977 PfirstHelper(config, in3, in3, in3);
Jacob Bramley0ce75842019-07-17 18:12:50 +01002978}
2979
Jacob Bramleye8289202019-07-31 11:25:23 +01002980TEST_SVE(sve_pfirst_alias) {
2981 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramley0ce75842019-07-17 18:12:50 +01002982 START();
2983
2984 // Check that the Simulator behaves correctly when all arguments are aliased.
2985 int in_b[] = {0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0};
2986 int in_h[] = {0, 0, 0, 0, 1, 1, 0, 0};
2987 int in_s[] = {0, 1, 1, 0};
2988 int in_d[] = {1, 1};
2989
2990 Initialise(&masm, p0.VnB(), in_b);
2991 Initialise(&masm, p1.VnH(), in_h);
2992 Initialise(&masm, p2.VnS(), in_s);
2993 Initialise(&masm, p3.VnD(), in_d);
2994
2995 // Initialise NZCV to an impossible value, to check that we actually write it.
2996 __ Mov(x10, NZCVFlag);
2997
2998 __ Msr(NZCV, x10);
Jacob Bramley7b5819c2020-06-17 17:29:16 +01002999 __ Pfirst(p0.VnB(), p0, p0.VnB());
Jacob Bramley0ce75842019-07-17 18:12:50 +01003000 __ Mrs(x0, NZCV);
3001
3002 __ Msr(NZCV, x10);
Jacob Bramley7b5819c2020-06-17 17:29:16 +01003003 __ Pfirst(p1.VnB(), p1, p1.VnB());
Jacob Bramley0ce75842019-07-17 18:12:50 +01003004 __ Mrs(x1, NZCV);
3005
3006 __ Msr(NZCV, x10);
Jacob Bramley7b5819c2020-06-17 17:29:16 +01003007 __ Pfirst(p2.VnB(), p2, p2.VnB());
Jacob Bramley0ce75842019-07-17 18:12:50 +01003008 __ Mrs(x2, NZCV);
3009
3010 __ Msr(NZCV, x10);
Jacob Bramley7b5819c2020-06-17 17:29:16 +01003011 __ Pfirst(p3.VnB(), p3, p3.VnB());
Jacob Bramley0ce75842019-07-17 18:12:50 +01003012 __ Mrs(x3, NZCV);
3013
3014 END();
3015
3016 if (CAN_RUN()) {
3017 RUN();
3018
3019 // The first active lane of pg is already active in pdn, so the P register
3020 // should be unchanged.
3021 ASSERT_EQUAL_SVE(in_b, p0.VnB());
3022 ASSERT_EQUAL_SVE(in_h, p1.VnH());
3023 ASSERT_EQUAL_SVE(in_s, p2.VnS());
3024 ASSERT_EQUAL_SVE(in_d, p3.VnD());
3025
3026 ASSERT_EQUAL_64(SVEFirstFlag, x0);
3027 ASSERT_EQUAL_64(SVEFirstFlag, x1);
3028 ASSERT_EQUAL_64(SVEFirstFlag, x2);
3029 ASSERT_EQUAL_64(SVEFirstFlag, x3);
3030 }
3031}
3032
Jacob Bramleye8289202019-07-31 11:25:23 +01003033TEST_SVE(sve_pnext_b) {
Jacob Bramley0ce75842019-07-17 18:12:50 +01003034 // TODO: Once we have the infrastructure, provide more lanes than kPRegMinSize
3035 // (to check propagation if we have a large VL), but few enough to make the
3036 // test easy to read.
3037 // For now, we just use kPRegMinSize so that the test works anywhere.
3038 int in0[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3039 int in1[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
3040 int in2[] = {0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
3041 int in3[] = {0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1};
3042 int in4[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3043
3044 // Pnext activates the next element that is true in pg, after the last-active
3045 // element in pn. If all pn elements are false (as in in0), it starts looking
3046 // at element 0.
3047
3048 // There are no active lanes in in0, so the result is simply the first active
3049 // lane from pg.
3050 int exp00[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3051 int exp10[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
3052 int exp20[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0};
3053 int exp30[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
3054 int exp40[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3055
3056 // The last active lane in in1 is here. |
3057 // v
3058 int exp01[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3059 int exp11[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3060 int exp21[] = {0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3061 int exp31[] = {0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3062 int exp41[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3063
3064 // | The last active lane in in2 is here.
3065 // v
3066 int exp02[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3067 int exp12[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3068 int exp22[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3069 int exp32[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3070 int exp42[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3071
3072 // | The last active lane in in3 is here.
3073 // v
3074 int exp03[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3075 int exp13[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3076 int exp23[] = {0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3077 int exp33[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3078 int exp43[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3079
3080 // | The last active lane in in4 is here.
3081 // v
3082 int exp04[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3083 int exp14[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3084 int exp24[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3085 int exp34[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3086 int exp44[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3087
Jacob Bramleye8289202019-07-31 11:25:23 +01003088 PnextHelper(config, kBRegSize, in0, in0, exp00);
3089 PnextHelper(config, kBRegSize, in1, in0, exp10);
3090 PnextHelper(config, kBRegSize, in2, in0, exp20);
3091 PnextHelper(config, kBRegSize, in3, in0, exp30);
3092 PnextHelper(config, kBRegSize, in4, in0, exp40);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003093
Jacob Bramleye8289202019-07-31 11:25:23 +01003094 PnextHelper(config, kBRegSize, in0, in1, exp01);
3095 PnextHelper(config, kBRegSize, in1, in1, exp11);
3096 PnextHelper(config, kBRegSize, in2, in1, exp21);
3097 PnextHelper(config, kBRegSize, in3, in1, exp31);
3098 PnextHelper(config, kBRegSize, in4, in1, exp41);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003099
Jacob Bramleye8289202019-07-31 11:25:23 +01003100 PnextHelper(config, kBRegSize, in0, in2, exp02);
3101 PnextHelper(config, kBRegSize, in1, in2, exp12);
3102 PnextHelper(config, kBRegSize, in2, in2, exp22);
3103 PnextHelper(config, kBRegSize, in3, in2, exp32);
3104 PnextHelper(config, kBRegSize, in4, in2, exp42);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003105
Jacob Bramleye8289202019-07-31 11:25:23 +01003106 PnextHelper(config, kBRegSize, in0, in3, exp03);
3107 PnextHelper(config, kBRegSize, in1, in3, exp13);
3108 PnextHelper(config, kBRegSize, in2, in3, exp23);
3109 PnextHelper(config, kBRegSize, in3, in3, exp33);
3110 PnextHelper(config, kBRegSize, in4, in3, exp43);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003111
Jacob Bramleye8289202019-07-31 11:25:23 +01003112 PnextHelper(config, kBRegSize, in0, in4, exp04);
3113 PnextHelper(config, kBRegSize, in1, in4, exp14);
3114 PnextHelper(config, kBRegSize, in2, in4, exp24);
3115 PnextHelper(config, kBRegSize, in3, in4, exp34);
3116 PnextHelper(config, kBRegSize, in4, in4, exp44);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003117}
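
// A reference-model sketch of the `pnext` behaviour described in the comments
// above (the function and parameter names are ours; the tests do not call it).
// Lane 0 is element 0 here, whereas the literal arrays in these tests list the
// highest-numbered lane first. As in the wider-lane tests that follow, a field
// counts as active only if its lowest bit is set.
inline void PnextReferenceModel(const int* pg,
                                const int* pn,
                                int lane_count,
                                int* pd) {
  // Find the last (highest-numbered) active lane of pn; -1 if there is none.
  int last_active = -1;
  for (int i = 0; i < lane_count; i++) {
    if ((pn[i] & 1) != 0) last_active = i;
  }
  // The result has at most one active lane: the first active lane of pg
  // strictly after last_active.
  for (int i = 0; i < lane_count; i++) {
    pd[i] = 0;
  }
  for (int i = last_active + 1; i < lane_count; i++) {
    if ((pg[i] & 1) != 0) {
      pd[i] = 1;
      break;
    }
  }
}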
3118
Jacob Bramleye8289202019-07-31 11:25:23 +01003119TEST_SVE(sve_pnext_h) {
Jacob Bramley0ce75842019-07-17 18:12:50 +01003120 // TODO: Once we have the infrastructure, provide more lanes than kPRegMinSize
3121 // (to check propagation if we have a large VL), but few enough to make the
3122 // test easy to read.
3123 // For now, we just use kPRegMinSize so that the test works anywhere.
3124 int in0[] = {0, 0, 0, 0, 0, 0, 0, 0};
3125 int in1[] = {0, 0, 0, 1, 0, 2, 1, 0};
3126 int in2[] = {0, 1, 2, 0, 2, 0, 2, 0};
3127 int in3[] = {0, 0, 0, 3, 0, 0, 0, 3};
3128 int in4[] = {3, 0, 0, 0, 0, 0, 0, 0};
3129
3130 // Pnext activates the next element that is true in pg, after the last-active
3131 // element in pn. If all pn elements are false (as in in0), it starts looking
3132 // at element 0.
3133 //
3134 // As for other SVE instructions, elements are only considered to be active if
3135 // the _first_ bit in each field is one. Other bits are ignored.
3136
3137 // There are no active lanes in in0, so the result is simply the first active
3138 // lane from pg.
3139 int exp00[] = {0, 0, 0, 0, 0, 0, 0, 0};
3140 int exp10[] = {0, 0, 0, 0, 0, 0, 1, 0};
3141 int exp20[] = {0, 1, 0, 0, 0, 0, 0, 0};
3142 int exp30[] = {0, 0, 0, 0, 0, 0, 0, 1};
3143 int exp40[] = {1, 0, 0, 0, 0, 0, 0, 0};
3144
3145 // | The last active lane in in1 is here.
3146 // v
3147 int exp01[] = {0, 0, 0, 0, 0, 0, 0, 0};
3148 int exp11[] = {0, 0, 0, 0, 0, 0, 0, 0};
3149 int exp21[] = {0, 1, 0, 0, 0, 0, 0, 0};
3150 int exp31[] = {0, 0, 0, 0, 0, 0, 0, 0};
3151 int exp41[] = {1, 0, 0, 0, 0, 0, 0, 0};
3152
3153 // | The last active lane in in2 is here.
3154 // v
3155 int exp02[] = {0, 0, 0, 0, 0, 0, 0, 0};
3156 int exp12[] = {0, 0, 0, 0, 0, 0, 0, 0};
3157 int exp22[] = {0, 0, 0, 0, 0, 0, 0, 0};
3158 int exp32[] = {0, 0, 0, 0, 0, 0, 0, 0};
3159 int exp42[] = {1, 0, 0, 0, 0, 0, 0, 0};
3160
3161 // | The last active lane in in3 is here.
3162 // v
3163 int exp03[] = {0, 0, 0, 0, 0, 0, 0, 0};
3164 int exp13[] = {0, 0, 0, 0, 0, 0, 0, 0};
3165 int exp23[] = {0, 1, 0, 0, 0, 0, 0, 0};
3166 int exp33[] = {0, 0, 0, 0, 0, 0, 0, 0};
3167 int exp43[] = {1, 0, 0, 0, 0, 0, 0, 0};
3168
3169 // | The last active lane in in4 is here.
3170 // v
3171 int exp04[] = {0, 0, 0, 0, 0, 0, 0, 0};
3172 int exp14[] = {0, 0, 0, 0, 0, 0, 0, 0};
3173 int exp24[] = {0, 0, 0, 0, 0, 0, 0, 0};
3174 int exp34[] = {0, 0, 0, 0, 0, 0, 0, 0};
3175 int exp44[] = {0, 0, 0, 0, 0, 0, 0, 0};
3176
Jacob Bramleye8289202019-07-31 11:25:23 +01003177 PnextHelper(config, kHRegSize, in0, in0, exp00);
3178 PnextHelper(config, kHRegSize, in1, in0, exp10);
3179 PnextHelper(config, kHRegSize, in2, in0, exp20);
3180 PnextHelper(config, kHRegSize, in3, in0, exp30);
3181 PnextHelper(config, kHRegSize, in4, in0, exp40);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003182
Jacob Bramleye8289202019-07-31 11:25:23 +01003183 PnextHelper(config, kHRegSize, in0, in1, exp01);
3184 PnextHelper(config, kHRegSize, in1, in1, exp11);
3185 PnextHelper(config, kHRegSize, in2, in1, exp21);
3186 PnextHelper(config, kHRegSize, in3, in1, exp31);
3187 PnextHelper(config, kHRegSize, in4, in1, exp41);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003188
Jacob Bramleye8289202019-07-31 11:25:23 +01003189 PnextHelper(config, kHRegSize, in0, in2, exp02);
3190 PnextHelper(config, kHRegSize, in1, in2, exp12);
3191 PnextHelper(config, kHRegSize, in2, in2, exp22);
3192 PnextHelper(config, kHRegSize, in3, in2, exp32);
3193 PnextHelper(config, kHRegSize, in4, in2, exp42);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003194
Jacob Bramleye8289202019-07-31 11:25:23 +01003195 PnextHelper(config, kHRegSize, in0, in3, exp03);
3196 PnextHelper(config, kHRegSize, in1, in3, exp13);
3197 PnextHelper(config, kHRegSize, in2, in3, exp23);
3198 PnextHelper(config, kHRegSize, in3, in3, exp33);
3199 PnextHelper(config, kHRegSize, in4, in3, exp43);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003200
Jacob Bramleye8289202019-07-31 11:25:23 +01003201 PnextHelper(config, kHRegSize, in0, in4, exp04);
3202 PnextHelper(config, kHRegSize, in1, in4, exp14);
3203 PnextHelper(config, kHRegSize, in2, in4, exp24);
3204 PnextHelper(config, kHRegSize, in3, in4, exp34);
3205 PnextHelper(config, kHRegSize, in4, in4, exp44);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003206}
3207
Jacob Bramleye8289202019-07-31 11:25:23 +01003208TEST_SVE(sve_pnext_s) {
Jacob Bramley0ce75842019-07-17 18:12:50 +01003209 // TODO: Once we have the infrastructure, provide more lanes than kPRegMinSize
3210 // (to check propagation if we have a large VL), but few enough to make the
3211 // test easy to read.
3212 // For now, we just use kPRegMinSize so that the test works anywhere.
3213 int in0[] = {0xe, 0xc, 0x8, 0x0};
3214 int in1[] = {0x0, 0x2, 0x0, 0x1};
3215 int in2[] = {0x0, 0x1, 0xf, 0x0};
3216 int in3[] = {0xf, 0x0, 0x0, 0x0};
3217
3218 // Pnext activates the next element that is true in pg, after the last-active
3219 // element in pn. If all pn elements are false (as in in0), it starts looking
3220 // at element 0.
3221 //
3222 // As for other SVE instructions, elements are only considered to be active if
3223 // the _first_ bit in each field is one. Other bits are ignored.
3224
3225 // There are no active lanes in in0, so the result is simply the first active
3226 // lane from pg.
3227 int exp00[] = {0, 0, 0, 0};
3228 int exp10[] = {0, 0, 0, 1};
3229 int exp20[] = {0, 0, 1, 0};
3230 int exp30[] = {1, 0, 0, 0};
3231
3232 // | The last active lane in in1 is here.
3233 // v
3234 int exp01[] = {0, 0, 0, 0};
3235 int exp11[] = {0, 0, 0, 0};
3236 int exp21[] = {0, 0, 1, 0};
3237 int exp31[] = {1, 0, 0, 0};
3238
3239 // | The last active lane in in2 is here.
3240 // v
3241 int exp02[] = {0, 0, 0, 0};
3242 int exp12[] = {0, 0, 0, 0};
3243 int exp22[] = {0, 0, 0, 0};
3244 int exp32[] = {1, 0, 0, 0};
3245
3246 // | The last active lane in in3 is here.
3247 // v
3248 int exp03[] = {0, 0, 0, 0};
3249 int exp13[] = {0, 0, 0, 0};
3250 int exp23[] = {0, 0, 0, 0};
3251 int exp33[] = {0, 0, 0, 0};
3252
Jacob Bramleye8289202019-07-31 11:25:23 +01003253 PnextHelper(config, kSRegSize, in0, in0, exp00);
3254 PnextHelper(config, kSRegSize, in1, in0, exp10);
3255 PnextHelper(config, kSRegSize, in2, in0, exp20);
3256 PnextHelper(config, kSRegSize, in3, in0, exp30);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003257
Jacob Bramleye8289202019-07-31 11:25:23 +01003258 PnextHelper(config, kSRegSize, in0, in1, exp01);
3259 PnextHelper(config, kSRegSize, in1, in1, exp11);
3260 PnextHelper(config, kSRegSize, in2, in1, exp21);
3261 PnextHelper(config, kSRegSize, in3, in1, exp31);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003262
Jacob Bramleye8289202019-07-31 11:25:23 +01003263 PnextHelper(config, kSRegSize, in0, in2, exp02);
3264 PnextHelper(config, kSRegSize, in1, in2, exp12);
3265 PnextHelper(config, kSRegSize, in2, in2, exp22);
3266 PnextHelper(config, kSRegSize, in3, in2, exp32);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003267
Jacob Bramleye8289202019-07-31 11:25:23 +01003268 PnextHelper(config, kSRegSize, in0, in3, exp03);
3269 PnextHelper(config, kSRegSize, in1, in3, exp13);
3270 PnextHelper(config, kSRegSize, in2, in3, exp23);
3271 PnextHelper(config, kSRegSize, in3, in3, exp33);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003272}
3273
Jacob Bramleye8289202019-07-31 11:25:23 +01003274TEST_SVE(sve_pnext_d) {
Jacob Bramley0ce75842019-07-17 18:12:50 +01003275 // TODO: Once we have the infrastructure, provide more lanes than kPRegMinSize
3276 // (to check propagation if we have a large VL), but few enough to make the
3277 // test easy to read.
3278 // For now, we just use kPRegMinSize so that the test works anywhere.
3279 int in0[] = {0xfe, 0xf0};
3280 int in1[] = {0x00, 0x55};
3281 int in2[] = {0x33, 0xff};
3282
3283 // Pnext activates the next element that is true in pg, after the last-active
3284 // element in pn. If all pn elements are false (as in in0), it starts looking
3285 // at element 0.
3286 //
3287 // As for other SVE instructions, elements are only considered to be active if
3288 // the _first_ bit in each field is one. Other bits are ignored.
3289
3290 // There are no active lanes in in0, so the result is simply the first active
3291 // lane from pg.
3292 int exp00[] = {0, 0};
3293 int exp10[] = {0, 1};
3294 int exp20[] = {0, 1};
3295
3296 // | The last active lane in in1 is here.
3297 // v
3298 int exp01[] = {0, 0};
3299 int exp11[] = {0, 0};
3300 int exp21[] = {1, 0};
3301
3302 // | The last active lane in in2 is here.
3303 // v
3304 int exp02[] = {0, 0};
3305 int exp12[] = {0, 0};
3306 int exp22[] = {0, 0};
3307
Jacob Bramleye8289202019-07-31 11:25:23 +01003308 PnextHelper(config, kDRegSize, in0, in0, exp00);
3309 PnextHelper(config, kDRegSize, in1, in0, exp10);
3310 PnextHelper(config, kDRegSize, in2, in0, exp20);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003311
Jacob Bramleye8289202019-07-31 11:25:23 +01003312 PnextHelper(config, kDRegSize, in0, in1, exp01);
3313 PnextHelper(config, kDRegSize, in1, in1, exp11);
3314 PnextHelper(config, kDRegSize, in2, in1, exp21);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003315
Jacob Bramleye8289202019-07-31 11:25:23 +01003316 PnextHelper(config, kDRegSize, in0, in2, exp02);
3317 PnextHelper(config, kDRegSize, in1, in2, exp12);
3318 PnextHelper(config, kDRegSize, in2, in2, exp22);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003319}
3320
Jacob Bramleye8289202019-07-31 11:25:23 +01003321TEST_SVE(sve_pnext_alias) {
3322 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003323 START();
3324
3325 // Check that the Simulator behaves correctly when all arguments are aliased.
3326 int in_b[] = {0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0};
3327 int in_h[] = {0, 0, 0, 0, 1, 1, 0, 0};
3328 int in_s[] = {0, 1, 1, 0};
3329 int in_d[] = {1, 1};
3330
3331 Initialise(&masm, p0.VnB(), in_b);
3332 Initialise(&masm, p1.VnH(), in_h);
3333 Initialise(&masm, p2.VnS(), in_s);
3334 Initialise(&masm, p3.VnD(), in_d);
3335
3336 // Initialise NZCV to an impossible value, to check that we actually write it.
3337 __ Mov(x10, NZCVFlag);
3338
3339 __ Msr(NZCV, x10);
Jacob Bramley7b5819c2020-06-17 17:29:16 +01003340 __ Pnext(p0.VnB(), p0, p0.VnB());
Jacob Bramley0ce75842019-07-17 18:12:50 +01003341 __ Mrs(x0, NZCV);
3342
3343 __ Msr(NZCV, x10);
Jacob Bramley7b5819c2020-06-17 17:29:16 +01003344 __ Pnext(p1.VnB(), p1, p1.VnB());
Jacob Bramley0ce75842019-07-17 18:12:50 +01003345 __ Mrs(x1, NZCV);
3346
3347 __ Msr(NZCV, x10);
Jacob Bramley7b5819c2020-06-17 17:29:16 +01003348 __ Pnext(p2.VnB(), p2, p2.VnB());
Jacob Bramley0ce75842019-07-17 18:12:50 +01003349 __ Mrs(x2, NZCV);
3350
3351 __ Msr(NZCV, x10);
Jacob Bramley7b5819c2020-06-17 17:29:16 +01003352 __ Pnext(p3.VnB(), p3, p3.VnB());
Jacob Bramley0ce75842019-07-17 18:12:50 +01003353 __ Mrs(x3, NZCV);
3354
3355 END();
3356
3357 if (CAN_RUN()) {
3358 RUN();
3359
3360 // Since pg.Is(pdn), there can be no active lanes in pg above the last
3361 // active lane in pdn, so the result should always be zero.
3362 ASSERT_EQUAL_SVE(0, p0.VnB());
3363 ASSERT_EQUAL_SVE(0, p1.VnH());
3364 ASSERT_EQUAL_SVE(0, p2.VnS());
3365 ASSERT_EQUAL_SVE(0, p3.VnD());
3366
3367 ASSERT_EQUAL_64(SVENoneFlag | SVENotLastFlag, x0);
3368 ASSERT_EQUAL_64(SVENoneFlag | SVENotLastFlag, x1);
3369 ASSERT_EQUAL_64(SVENoneFlag | SVENotLastFlag, x2);
3370 ASSERT_EQUAL_64(SVENoneFlag | SVENotLastFlag, x3);
3371 }
3372}
3373
Jacob Bramleye8289202019-07-31 11:25:23 +01003374static void PtrueHelper(Test* config,
3375 unsigned lane_size_in_bits,
Jacob Bramley0ce75842019-07-17 18:12:50 +01003376 FlagsUpdate s = LeaveFlags) {
Jacob Bramleye8289202019-07-31 11:25:23 +01003377 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003378 START();
3379
3380 PRegisterWithLaneSize p[kNumberOfPRegisters];
3381 for (unsigned i = 0; i < kNumberOfPRegisters; i++) {
3382 p[i] = PRegister(i).WithLaneSize(lane_size_in_bits);
3383 }
3384
3385 // Initialise NZCV to an impossible value, to check that we actually write it.
3386 StatusFlags nzcv_unmodified = NZCVFlag;
3387 __ Mov(x20, nzcv_unmodified);
3388
3389 // We don't have enough registers to conveniently test every pattern, so take
3390 // samples from each group.
3391 __ Msr(NZCV, x20);
3392 __ Ptrue(p[0], SVE_POW2, s);
3393 __ Mrs(x0, NZCV);
3394
3395 __ Msr(NZCV, x20);
3396 __ Ptrue(p[1], SVE_VL1, s);
3397 __ Mrs(x1, NZCV);
3398
3399 __ Msr(NZCV, x20);
3400 __ Ptrue(p[2], SVE_VL2, s);
3401 __ Mrs(x2, NZCV);
3402
3403 __ Msr(NZCV, x20);
3404 __ Ptrue(p[3], SVE_VL5, s);
3405 __ Mrs(x3, NZCV);
3406
3407 __ Msr(NZCV, x20);
3408 __ Ptrue(p[4], SVE_VL6, s);
3409 __ Mrs(x4, NZCV);
3410
3411 __ Msr(NZCV, x20);
3412 __ Ptrue(p[5], SVE_VL8, s);
3413 __ Mrs(x5, NZCV);
3414
3415 __ Msr(NZCV, x20);
3416 __ Ptrue(p[6], SVE_VL16, s);
3417 __ Mrs(x6, NZCV);
3418
3419 __ Msr(NZCV, x20);
3420 __ Ptrue(p[7], SVE_VL64, s);
3421 __ Mrs(x7, NZCV);
3422
3423 __ Msr(NZCV, x20);
3424 __ Ptrue(p[8], SVE_VL256, s);
3425 __ Mrs(x8, NZCV);
3426
3427 {
3428 // We have to use the Assembler to encode pattern values not defined by
3429 // SVEPredicateConstraint, so call `ptrue` or `ptrues` directly.
3430 typedef void (
3431 MacroAssembler::*AssemblePtrueFn)(const PRegisterWithLaneSize& pd,
3432 int pattern);
3433 AssemblePtrueFn assemble =
3434 (s == SetFlags) ? &MacroAssembler::ptrues : &MacroAssembler::ptrue;
3435
3436 ExactAssemblyScope guard(&masm, 12 * kInstructionSize);
3437 __ msr(NZCV, x20);
3438 (masm.*assemble)(p[9], 0xe);
3439 __ mrs(x9, NZCV);
3440
3441 __ msr(NZCV, x20);
3442 (masm.*assemble)(p[10], 0x16);
3443 __ mrs(x10, NZCV);
3444
3445 __ msr(NZCV, x20);
3446 (masm.*assemble)(p[11], 0x1a);
3447 __ mrs(x11, NZCV);
3448
3449 __ msr(NZCV, x20);
3450 (masm.*assemble)(p[12], 0x1c);
3451 __ mrs(x12, NZCV);
3452 }
3453
3454 __ Msr(NZCV, x20);
3455 __ Ptrue(p[13], SVE_MUL4, s);
3456 __ Mrs(x13, NZCV);
3457
3458 __ Msr(NZCV, x20);
3459 __ Ptrue(p[14], SVE_MUL3, s);
3460 __ Mrs(x14, NZCV);
3461
3462 __ Msr(NZCV, x20);
3463 __ Ptrue(p[15], SVE_ALL, s);
3464 __ Mrs(x15, NZCV);
3465
3466 END();
3467
3468 if (CAN_RUN()) {
3469 RUN();
3470
3471 int all = core.GetSVELaneCount(lane_size_in_bits);
3472 int pow2 = 1 << HighestSetBitPosition(all);
3473 int mul4 = all - (all % 4);
3474 int mul3 = all - (all % 3);
3475
3476 // Check P register results.
3477 for (int i = 0; i < all; i++) {
3478 ASSERT_EQUAL_SVE_LANE(i < pow2, p[0], i);
3479 ASSERT_EQUAL_SVE_LANE((all >= 1) && (i < 1), p[1], i);
3480 ASSERT_EQUAL_SVE_LANE((all >= 2) && (i < 2), p[2], i);
3481 ASSERT_EQUAL_SVE_LANE((all >= 5) && (i < 5), p[3], i);
3482 ASSERT_EQUAL_SVE_LANE((all >= 6) && (i < 6), p[4], i);
3483 ASSERT_EQUAL_SVE_LANE((all >= 8) && (i < 8), p[5], i);
3484 ASSERT_EQUAL_SVE_LANE((all >= 16) && (i < 16), p[6], i);
3485 ASSERT_EQUAL_SVE_LANE((all >= 64) && (i < 64), p[7], i);
3486 ASSERT_EQUAL_SVE_LANE((all >= 256) && (i < 256), p[8], i);
3487 ASSERT_EQUAL_SVE_LANE(false, p[9], i);
3488 ASSERT_EQUAL_SVE_LANE(false, p[10], i);
3489 ASSERT_EQUAL_SVE_LANE(false, p[11], i);
3490 ASSERT_EQUAL_SVE_LANE(false, p[12], i);
3491 ASSERT_EQUAL_SVE_LANE(i < mul4, p[13], i);
3492 ASSERT_EQUAL_SVE_LANE(i < mul3, p[14], i);
3493 ASSERT_EQUAL_SVE_LANE(true, p[15], i);
3494 }
3495
3496 // Check NZCV results.
3497 if (s == LeaveFlags) {
3498 // No flags should have been updated.
3499 for (int i = 0; i <= 15; i++) {
3500 ASSERT_EQUAL_64(nzcv_unmodified, XRegister(i));
3501 }
3502 } else {
3503 StatusFlags zero = static_cast<StatusFlags>(SVENoneFlag | SVENotLastFlag);
3504 StatusFlags nonzero = SVEFirstFlag;
3505
3506 // POW2
3507 ASSERT_EQUAL_64(nonzero, x0);
3508 // VL*
3509 ASSERT_EQUAL_64((all >= 1) ? nonzero : zero, x1);
3510 ASSERT_EQUAL_64((all >= 2) ? nonzero : zero, x2);
3511 ASSERT_EQUAL_64((all >= 5) ? nonzero : zero, x3);
3512 ASSERT_EQUAL_64((all >= 6) ? nonzero : zero, x4);
3513 ASSERT_EQUAL_64((all >= 8) ? nonzero : zero, x5);
3514 ASSERT_EQUAL_64((all >= 16) ? nonzero : zero, x6);
3515 ASSERT_EQUAL_64((all >= 64) ? nonzero : zero, x7);
3516 ASSERT_EQUAL_64((all >= 256) ? nonzero : zero, x8);
3517 // #uimm5
3518 ASSERT_EQUAL_64(zero, x9);
3519 ASSERT_EQUAL_64(zero, x10);
3520 ASSERT_EQUAL_64(zero, x11);
3521 ASSERT_EQUAL_64(zero, x12);
3522 // MUL*
3523 ASSERT_EQUAL_64((all >= 4) ? nonzero : zero, x13);
3524 ASSERT_EQUAL_64((all >= 3) ? nonzero : zero, x14);
3525 // ALL
3526 ASSERT_EQUAL_64(nonzero, x15);
3527 }
3528 }
3529}
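
// The expected element counts checked above follow a simple rule per pattern:
// fixed VL<n> patterns give n elements only if n fits in the vector, POW2
// rounds the available count down to a power of two, MUL3 and MUL4 round down
// to a multiple, ALL takes every lane, and unallocated encodings (such as the
// raw immediates used above) give an empty predicate. The sketch below spells
// this out for readers; the function name is ours and the tests do not use it.
inline int PtruePatternElements(int pattern, int available_lanes) {
  switch (pattern) {
    case SVE_POW2:
      return 1 << HighestSetBitPosition(available_lanes);
    case SVE_VL1:
      return (available_lanes >= 1) ? 1 : 0;
    case SVE_VL2:
      return (available_lanes >= 2) ? 2 : 0;
    case SVE_VL3:
      return (available_lanes >= 3) ? 3 : 0;
    case SVE_VL4:
      return (available_lanes >= 4) ? 4 : 0;
    case SVE_VL5:
      return (available_lanes >= 5) ? 5 : 0;
    case SVE_VL6:
      return (available_lanes >= 6) ? 6 : 0;
    case SVE_VL7:
      return (available_lanes >= 7) ? 7 : 0;
    case SVE_VL8:
      return (available_lanes >= 8) ? 8 : 0;
    case SVE_VL16:
      return (available_lanes >= 16) ? 16 : 0;
    case SVE_VL32:
      return (available_lanes >= 32) ? 32 : 0;
    case SVE_VL64:
      return (available_lanes >= 64) ? 64 : 0;
    case SVE_VL128:
      return (available_lanes >= 128) ? 128 : 0;
    case SVE_VL256:
      return (available_lanes >= 256) ? 256 : 0;
    case SVE_MUL4:
      return available_lanes - (available_lanes % 4);
    case SVE_MUL3:
      return available_lanes - (available_lanes % 3);
    case SVE_ALL:
      return available_lanes;
    default:
      return 0;
  }
}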
3530
Jacob Bramleye8289202019-07-31 11:25:23 +01003531TEST_SVE(sve_ptrue_b) { PtrueHelper(config, kBRegSize, LeaveFlags); }
3532TEST_SVE(sve_ptrue_h) { PtrueHelper(config, kHRegSize, LeaveFlags); }
3533TEST_SVE(sve_ptrue_s) { PtrueHelper(config, kSRegSize, LeaveFlags); }
3534TEST_SVE(sve_ptrue_d) { PtrueHelper(config, kDRegSize, LeaveFlags); }
Jacob Bramley0ce75842019-07-17 18:12:50 +01003535
Jacob Bramleye8289202019-07-31 11:25:23 +01003536TEST_SVE(sve_ptrues_b) { PtrueHelper(config, kBRegSize, SetFlags); }
3537TEST_SVE(sve_ptrues_h) { PtrueHelper(config, kHRegSize, SetFlags); }
3538TEST_SVE(sve_ptrues_s) { PtrueHelper(config, kSRegSize, SetFlags); }
3539TEST_SVE(sve_ptrues_d) { PtrueHelper(config, kDRegSize, SetFlags); }
Jacob Bramley0ce75842019-07-17 18:12:50 +01003540
Jacob Bramleye8289202019-07-31 11:25:23 +01003541TEST_SVE(sve_pfalse) {
3542 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003543 START();
3544
3545 // Initialise non-zero inputs.
3546 __ Ptrue(p0.VnB());
3547 __ Ptrue(p1.VnH());
3548 __ Ptrue(p2.VnS());
3549 __ Ptrue(p3.VnD());
3550
3551 // The instruction only supports B-sized lanes, but the lane size has no
3552 // logical effect, so the MacroAssembler accepts anything.
3553 __ Pfalse(p0.VnB());
3554 __ Pfalse(p1.VnH());
3555 __ Pfalse(p2.VnS());
3556 __ Pfalse(p3.VnD());
3557
3558 END();
3559
3560 if (CAN_RUN()) {
3561 RUN();
3562
3563 ASSERT_EQUAL_SVE(0, p0.VnB());
3564 ASSERT_EQUAL_SVE(0, p1.VnB());
3565 ASSERT_EQUAL_SVE(0, p2.VnB());
3566 ASSERT_EQUAL_SVE(0, p3.VnB());
3567 }
3568}
3569
Jacob Bramleye8289202019-07-31 11:25:23 +01003570TEST_SVE(sve_ptest) {
3571 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003572 START();
3573
3574 // Initialise NZCV to a known (impossible) value.
3575 StatusFlags nzcv_unmodified = NZCVFlag;
3576 __ Mov(x0, nzcv_unmodified);
3577 __ Msr(NZCV, x0);
3578
3579 // Construct some test inputs.
3580 int in2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0};
3581 int in3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0};
3582 int in4[] = {0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0};
3583 __ Pfalse(p0.VnB());
3584 __ Ptrue(p1.VnB());
3585 Initialise(&masm, p2.VnB(), in2);
3586 Initialise(&masm, p3.VnB(), in3);
3587 Initialise(&masm, p4.VnB(), in4);
3588
3589 // All-inactive pg.
3590 __ Ptest(p0, p0.VnB());
3591 __ Mrs(x0, NZCV);
3592 __ Ptest(p0, p1.VnB());
3593 __ Mrs(x1, NZCV);
3594 __ Ptest(p0, p2.VnB());
3595 __ Mrs(x2, NZCV);
3596 __ Ptest(p0, p3.VnB());
3597 __ Mrs(x3, NZCV);
3598 __ Ptest(p0, p4.VnB());
3599 __ Mrs(x4, NZCV);
3600
3601 // All-active pg.
3602 __ Ptest(p1, p0.VnB());
3603 __ Mrs(x5, NZCV);
3604 __ Ptest(p1, p1.VnB());
3605 __ Mrs(x6, NZCV);
3606 __ Ptest(p1, p2.VnB());
3607 __ Mrs(x7, NZCV);
3608 __ Ptest(p1, p3.VnB());
3609 __ Mrs(x8, NZCV);
3610 __ Ptest(p1, p4.VnB());
3611 __ Mrs(x9, NZCV);
3612
3613 // Combinations of other inputs.
3614 __ Ptest(p2, p2.VnB());
3615 __ Mrs(x20, NZCV);
3616 __ Ptest(p2, p3.VnB());
3617 __ Mrs(x21, NZCV);
3618 __ Ptest(p2, p4.VnB());
3619 __ Mrs(x22, NZCV);
3620 __ Ptest(p3, p2.VnB());
3621 __ Mrs(x23, NZCV);
3622 __ Ptest(p3, p3.VnB());
3623 __ Mrs(x24, NZCV);
3624 __ Ptest(p3, p4.VnB());
3625 __ Mrs(x25, NZCV);
3626 __ Ptest(p4, p2.VnB());
3627 __ Mrs(x26, NZCV);
3628 __ Ptest(p4, p3.VnB());
3629 __ Mrs(x27, NZCV);
3630 __ Ptest(p4, p4.VnB());
3631 __ Mrs(x28, NZCV);
3632
3633 END();
3634
3635 if (CAN_RUN()) {
3636 RUN();
3637
3638 StatusFlags zero = static_cast<StatusFlags>(SVENoneFlag | SVENotLastFlag);
3639
3640 // If pg is all inactive, the value of pn is irrelevant.
3641 ASSERT_EQUAL_64(zero, x0);
3642 ASSERT_EQUAL_64(zero, x1);
3643 ASSERT_EQUAL_64(zero, x2);
3644 ASSERT_EQUAL_64(zero, x3);
3645 ASSERT_EQUAL_64(zero, x4);
3646
3647 // All-active pg.
3648 ASSERT_EQUAL_64(zero, x5); // All-inactive pn.
3649 ASSERT_EQUAL_64(SVEFirstFlag, x6); // All-active pn.
3650 // Other pn inputs are non-zero, but the first and last lanes are inactive.
3651 ASSERT_EQUAL_64(SVENotLastFlag, x7);
3652 ASSERT_EQUAL_64(SVENotLastFlag, x8);
3653 ASSERT_EQUAL_64(SVENotLastFlag, x9);
3654
3655 // Other inputs.
3656 ASSERT_EQUAL_64(SVEFirstFlag, x20); // pg: in2, pn: in2
3657 ASSERT_EQUAL_64(NoFlag, x21); // pg: in2, pn: in3
3658 ASSERT_EQUAL_64(zero, x22); // pg: in2, pn: in4
3659 ASSERT_EQUAL_64(static_cast<StatusFlags>(SVEFirstFlag | SVENotLastFlag),
3660 x23); // pg: in3, pn: in2
3661 ASSERT_EQUAL_64(SVEFirstFlag, x24); // pg: in3, pn: in3
3662 ASSERT_EQUAL_64(zero, x25); // pg: in3, pn: in4
3663 ASSERT_EQUAL_64(zero, x26); // pg: in4, pn: in2
3664 ASSERT_EQUAL_64(zero, x27); // pg: in4, pn: in3
3665 ASSERT_EQUAL_64(SVEFirstFlag, x28); // pg: in4, pn: in4
3666 }
3667}
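
// The NZCV values checked above follow from how `ptest` derives flags from the
// governing predicate pg and the tested predicate pn: N is the value of pn at
// the first active lane of pg, Z is set if pn is false at every active lane of
// pg, C is set if pn is false at the last active lane of pg, and V is always
// clear. The sketch below is for readers only (the function name is ours; the
// tests do not call it). Lane 0 is element 0 here, whereas the literal arrays
// above list the highest-numbered lane first.
inline StatusFlags PtestReferenceModel(const int* pg,
                                       const int* pn,
                                       int lane_count) {
  bool first = false;
  bool last = false;
  bool any = false;
  bool seen_active = false;
  for (int i = 0; i < lane_count; i++) {
    if (pg[i] == 0) continue;
    bool value = (pn[i] != 0);
    if (!seen_active) {
      first = value;
      seen_active = true;
    }
    last = value;
    any = any || value;
  }
  int flags = NoFlag;
  if (first) flags |= SVEFirstFlag;    // N
  if (!any) flags |= SVENoneFlag;      // Z
  if (!last) flags |= SVENotLastFlag;  // C
  return static_cast<StatusFlags>(flags);
}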
3668
Jacob Bramleye8289202019-07-31 11:25:23 +01003669TEST_SVE(sve_cntp) {
3670 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramleyd961a0c2019-07-17 10:53:45 +01003671 START();
3672
3673 // There are {7, 5, 2, 1} active {B, H, S, D} lanes.
3674 int p0_inputs[] = {0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0};
3675 Initialise(&masm, p0.VnB(), p0_inputs);
3676
3677 // With an all-true predicate, these instructions measure the vector length.
3678 __ Ptrue(p10.VnB());
3679 __ Ptrue(p11.VnH());
3680 __ Ptrue(p12.VnS());
3681 __ Ptrue(p13.VnD());
3682
3683 // `ptrue p10.b` provides an all-active pg.
3684 __ Cntp(x10, p10, p10.VnB());
3685 __ Cntp(x11, p10, p11.VnH());
3686 __ Cntp(x12, p10, p12.VnS());
3687 __ Cntp(x13, p10, p13.VnD());
3688
3689 // Check that the predicate mask is applied properly.
3690 __ Cntp(x14, p10, p10.VnB());
3691 __ Cntp(x15, p11, p10.VnB());
3692 __ Cntp(x16, p12, p10.VnB());
3693 __ Cntp(x17, p13, p10.VnB());
3694
3695 // Check other patterns (including some ignored bits).
3696 __ Cntp(x0, p10, p0.VnB());
3697 __ Cntp(x1, p10, p0.VnH());
3698 __ Cntp(x2, p10, p0.VnS());
3699 __ Cntp(x3, p10, p0.VnD());
3700 __ Cntp(x4, p0, p10.VnB());
3701 __ Cntp(x5, p0, p10.VnH());
3702 __ Cntp(x6, p0, p10.VnS());
3703 __ Cntp(x7, p0, p10.VnD());
3704
3705 END();
3706
3707 if (CAN_RUN()) {
3708 RUN();
3709
3710 int vl_b = core.GetSVELaneCount(kBRegSize);
3711 int vl_h = core.GetSVELaneCount(kHRegSize);
3712 int vl_s = core.GetSVELaneCount(kSRegSize);
3713 int vl_d = core.GetSVELaneCount(kDRegSize);
3714
3715 // Check all-active predicates in various combinations.
3716 ASSERT_EQUAL_64(vl_b, x10);
3717 ASSERT_EQUAL_64(vl_h, x11);
3718 ASSERT_EQUAL_64(vl_s, x12);
3719 ASSERT_EQUAL_64(vl_d, x13);
3720
3721 ASSERT_EQUAL_64(vl_b, x14);
3722 ASSERT_EQUAL_64(vl_h, x15);
3723 ASSERT_EQUAL_64(vl_s, x16);
3724 ASSERT_EQUAL_64(vl_d, x17);
3725
3726 // Check that irrelevant bits are properly ignored.
3727 ASSERT_EQUAL_64(7, x0);
3728 ASSERT_EQUAL_64(5, x1);
3729 ASSERT_EQUAL_64(2, x2);
3730 ASSERT_EQUAL_64(1, x3);
3731
3732 ASSERT_EQUAL_64(7, x4);
3733 ASSERT_EQUAL_64(5, x5);
3734 ASSERT_EQUAL_64(2, x6);
3735 ASSERT_EQUAL_64(1, x7);
3736 }
3737}
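
// `cntp` counts the lanes that are active in both the governing predicate pg
// (interpreted at the lane size of pn) and in pn itself, so the expectations
// above reduce to an intersection count. An illustrative sketch, with names of
// our choosing and one int per lane; the tests do not call it:
inline int CntpReferenceModel(const int* pg, const int* pn, int lane_count) {
  int count = 0;
  for (int i = 0; i < lane_count; i++) {
    if (((pg[i] & 1) != 0) && ((pn[i] & 1) != 0)) count++;
  }
  return count;
}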
3738
Martyn Capewell74f84f62019-10-30 15:30:44 +00003739typedef void (MacroAssembler::*CntFn)(const Register& dst,
3740 int pattern,
3741 int multiplier);
3742
Martyn Capewell91d5ba32019-11-01 18:11:23 +00003743template <typename T>
3744void GenerateCntSequence(MacroAssembler* masm,
3745 CntFn cnt,
3746 T acc_value,
3747 int multiplier) {
3748 // Initialise accumulators.
3749 masm->Mov(x0, acc_value);
3750 masm->Mov(x1, acc_value);
3751 masm->Mov(x2, acc_value);
3752 masm->Mov(x3, acc_value);
3753 masm->Mov(x4, acc_value);
3754 masm->Mov(x5, acc_value);
3755 masm->Mov(x6, acc_value);
3756 masm->Mov(x7, acc_value);
3757 masm->Mov(x8, acc_value);
3758 masm->Mov(x9, acc_value);
3759 masm->Mov(x10, acc_value);
3760 masm->Mov(x11, acc_value);
3761 masm->Mov(x12, acc_value);
3762 masm->Mov(x13, acc_value);
3763 masm->Mov(x14, acc_value);
3764 masm->Mov(x15, acc_value);
3765 masm->Mov(x18, acc_value);
3766 masm->Mov(x19, acc_value);
3767 masm->Mov(x20, acc_value);
3768 masm->Mov(x21, acc_value);
3769
3770 (masm->*cnt)(Register(0, sizeof(T) * kBitsPerByte), SVE_POW2, multiplier);
3771 (masm->*cnt)(Register(1, sizeof(T) * kBitsPerByte), SVE_VL1, multiplier);
3772 (masm->*cnt)(Register(2, sizeof(T) * kBitsPerByte), SVE_VL2, multiplier);
3773 (masm->*cnt)(Register(3, sizeof(T) * kBitsPerByte), SVE_VL3, multiplier);
3774 (masm->*cnt)(Register(4, sizeof(T) * kBitsPerByte), SVE_VL4, multiplier);
3775 (masm->*cnt)(Register(5, sizeof(T) * kBitsPerByte), SVE_VL5, multiplier);
3776 (masm->*cnt)(Register(6, sizeof(T) * kBitsPerByte), SVE_VL6, multiplier);
3777 (masm->*cnt)(Register(7, sizeof(T) * kBitsPerByte), SVE_VL7, multiplier);
3778 (masm->*cnt)(Register(8, sizeof(T) * kBitsPerByte), SVE_VL8, multiplier);
3779 (masm->*cnt)(Register(9, sizeof(T) * kBitsPerByte), SVE_VL16, multiplier);
3780 (masm->*cnt)(Register(10, sizeof(T) * kBitsPerByte), SVE_VL32, multiplier);
3781 (masm->*cnt)(Register(11, sizeof(T) * kBitsPerByte), SVE_VL64, multiplier);
3782 (masm->*cnt)(Register(12, sizeof(T) * kBitsPerByte), SVE_VL128, multiplier);
3783 (masm->*cnt)(Register(13, sizeof(T) * kBitsPerByte), SVE_VL256, multiplier);
3784 (masm->*cnt)(Register(14, sizeof(T) * kBitsPerByte), 16, multiplier);
3785 (masm->*cnt)(Register(15, sizeof(T) * kBitsPerByte), 23, multiplier);
3786 (masm->*cnt)(Register(18, sizeof(T) * kBitsPerByte), 28, multiplier);
3787 (masm->*cnt)(Register(19, sizeof(T) * kBitsPerByte), SVE_MUL4, multiplier);
3788 (masm->*cnt)(Register(20, sizeof(T) * kBitsPerByte), SVE_MUL3, multiplier);
3789 (masm->*cnt)(Register(21, sizeof(T) * kBitsPerByte), SVE_ALL, multiplier);
3790}
3791
3792int FixedVL(int fixed, int length) {
3793 VIXL_ASSERT(((fixed >= 1) && (fixed <= 8)) || (fixed == 16) ||
3794 (fixed == 32) || (fixed == 64) || (fixed == 128) ||
3795 (fixed == 256));
3796 return (length >= fixed) ? fixed : 0;
3797}
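
// As a concrete illustration of how the expected values below are formed (not
// called by the tests), consider a hypothetical vector with 48 B-sized lanes:
// POW2 selects 32, VL16 selects 16, VL64 does not fit and selects 0, MUL4 and
// MUL3 both select 48, and ALL selects 48.
inline void FixedVLExamples() {
  VIXL_ASSERT(FixedVL(16, 48) == 16);  // VL16 fits in 48 lanes.
  VIXL_ASSERT(FixedVL(64, 48) == 0);   // VL64 does not fit.
}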
3798
Martyn Capewell74f84f62019-10-30 15:30:44 +00003799static void CntHelper(Test* config,
3800 CntFn cnt,
3801 int multiplier,
Martyn Capewell579c92d2019-10-30 17:48:52 +00003802 int lane_size_in_bits,
3803 int64_t acc_value = 0,
3804 bool is_increment = true) {
Martyn Capewell74f84f62019-10-30 15:30:44 +00003805 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
3806 START();
Martyn Capewell91d5ba32019-11-01 18:11:23 +00003807 GenerateCntSequence(&masm, cnt, acc_value, multiplier);
Martyn Capewell74f84f62019-10-30 15:30:44 +00003808 END();
3809
3810 if (CAN_RUN()) {
3811 RUN();
3812
3813 int all = core.GetSVELaneCount(lane_size_in_bits);
3814 int pow2 = 1 << HighestSetBitPosition(all);
3815 int mul4 = all - (all % 4);
3816 int mul3 = all - (all % 3);
3817
Martyn Capewell579c92d2019-10-30 17:48:52 +00003818 multiplier = is_increment ? multiplier : -multiplier;
3819
3820 ASSERT_EQUAL_64(acc_value + (multiplier * pow2), x0);
Martyn Capewell91d5ba32019-11-01 18:11:23 +00003821 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(1, all)), x1);
3822 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(2, all)), x2);
3823 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(3, all)), x3);
3824 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(4, all)), x4);
3825 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(5, all)), x5);
3826 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(6, all)), x6);
3827 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(7, all)), x7);
3828 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(8, all)), x8);
3829 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(16, all)), x9);
3830 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(32, all)), x10);
3831 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(64, all)), x11);
3832 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(128, all)), x12);
3833 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(256, all)), x13);
Martyn Capewell579c92d2019-10-30 17:48:52 +00003834 ASSERT_EQUAL_64(acc_value, x14);
3835 ASSERT_EQUAL_64(acc_value, x15);
3836 ASSERT_EQUAL_64(acc_value, x18);
3837 ASSERT_EQUAL_64(acc_value + (multiplier * mul4), x19);
3838 ASSERT_EQUAL_64(acc_value + (multiplier * mul3), x20);
3839 ASSERT_EQUAL_64(acc_value + (multiplier * all), x21);
Martyn Capewell74f84f62019-10-30 15:30:44 +00003840 }
3841}
3842
Martyn Capewell579c92d2019-10-30 17:48:52 +00003843static void IncHelper(Test* config,
3844 CntFn cnt,
3845 int multiplier,
3846 int lane_size_in_bits,
3847 int64_t acc_value) {
3848 CntHelper(config, cnt, multiplier, lane_size_in_bits, acc_value, true);
3849}
3850
3851static void DecHelper(Test* config,
3852 CntFn cnt,
3853 int multiplier,
3854 int lane_size_in_bits,
3855 int64_t acc_value) {
3856 CntHelper(config, cnt, multiplier, lane_size_in_bits, acc_value, false);
3857}
3858
Martyn Capewell74f84f62019-10-30 15:30:44 +00003859TEST_SVE(sve_cntb) {
3860 CntHelper(config, &MacroAssembler::Cntb, 1, kBRegSize);
3861 CntHelper(config, &MacroAssembler::Cntb, 2, kBRegSize);
3862 CntHelper(config, &MacroAssembler::Cntb, 15, kBRegSize);
3863 CntHelper(config, &MacroAssembler::Cntb, 16, kBRegSize);
3864}
3865
3866TEST_SVE(sve_cnth) {
3867 CntHelper(config, &MacroAssembler::Cnth, 1, kHRegSize);
3868 CntHelper(config, &MacroAssembler::Cnth, 2, kHRegSize);
3869 CntHelper(config, &MacroAssembler::Cnth, 15, kHRegSize);
3870 CntHelper(config, &MacroAssembler::Cnth, 16, kHRegSize);
3871}
3872
3873TEST_SVE(sve_cntw) {
3874 CntHelper(config, &MacroAssembler::Cntw, 1, kWRegSize);
3875 CntHelper(config, &MacroAssembler::Cntw, 2, kWRegSize);
3876 CntHelper(config, &MacroAssembler::Cntw, 15, kWRegSize);
3877 CntHelper(config, &MacroAssembler::Cntw, 16, kWRegSize);
3878}
3879
3880TEST_SVE(sve_cntd) {
3881 CntHelper(config, &MacroAssembler::Cntd, 1, kDRegSize);
3882 CntHelper(config, &MacroAssembler::Cntd, 2, kDRegSize);
3883 CntHelper(config, &MacroAssembler::Cntd, 15, kDRegSize);
3884 CntHelper(config, &MacroAssembler::Cntd, 16, kDRegSize);
3885}
3886
Martyn Capewell579c92d2019-10-30 17:48:52 +00003887TEST_SVE(sve_decb) {
3888 DecHelper(config, &MacroAssembler::Decb, 1, kBRegSize, 42);
3889 DecHelper(config, &MacroAssembler::Decb, 2, kBRegSize, -1);
3890 DecHelper(config, &MacroAssembler::Decb, 15, kBRegSize, INT64_MIN);
3891 DecHelper(config, &MacroAssembler::Decb, 16, kBRegSize, -42);
3892}
3893
3894TEST_SVE(sve_dech) {
3895 DecHelper(config, &MacroAssembler::Dech, 1, kHRegSize, 42);
3896 DecHelper(config, &MacroAssembler::Dech, 2, kHRegSize, -1);
3897 DecHelper(config, &MacroAssembler::Dech, 15, kHRegSize, INT64_MIN);
3898 DecHelper(config, &MacroAssembler::Dech, 16, kHRegSize, -42);
3899}
3900
3901TEST_SVE(sve_decw) {
3902 DecHelper(config, &MacroAssembler::Decw, 1, kWRegSize, 42);
3903 DecHelper(config, &MacroAssembler::Decw, 2, kWRegSize, -1);
3904 DecHelper(config, &MacroAssembler::Decw, 15, kWRegSize, INT64_MIN);
3905 DecHelper(config, &MacroAssembler::Decw, 16, kWRegSize, -42);
3906}
3907
3908TEST_SVE(sve_decd) {
3909 DecHelper(config, &MacroAssembler::Decd, 1, kDRegSize, 42);
3910 DecHelper(config, &MacroAssembler::Decd, 2, kDRegSize, -1);
3911 DecHelper(config, &MacroAssembler::Decd, 15, kDRegSize, INT64_MIN);
3912 DecHelper(config, &MacroAssembler::Decd, 16, kDRegSize, -42);
3913}
3914
3915TEST_SVE(sve_incb) {
3916 IncHelper(config, &MacroAssembler::Incb, 1, kBRegSize, 42);
3917 IncHelper(config, &MacroAssembler::Incb, 2, kBRegSize, -1);
3918 IncHelper(config, &MacroAssembler::Incb, 15, kBRegSize, INT64_MAX);
3919 IncHelper(config, &MacroAssembler::Incb, 16, kBRegSize, -42);
3920}
3921
3922TEST_SVE(sve_inch) {
3923 IncHelper(config, &MacroAssembler::Inch, 1, kHRegSize, 42);
3924 IncHelper(config, &MacroAssembler::Inch, 2, kHRegSize, -1);
3925 IncHelper(config, &MacroAssembler::Inch, 15, kHRegSize, INT64_MAX);
3926 IncHelper(config, &MacroAssembler::Inch, 16, kHRegSize, -42);
3927}
3928
3929TEST_SVE(sve_incw) {
3930 IncHelper(config, &MacroAssembler::Incw, 1, kWRegSize, 42);
3931 IncHelper(config, &MacroAssembler::Incw, 2, kWRegSize, -1);
3932 IncHelper(config, &MacroAssembler::Incw, 15, kWRegSize, INT64_MAX);
3933 IncHelper(config, &MacroAssembler::Incw, 16, kWRegSize, -42);
3934}
3935
3936TEST_SVE(sve_incd) {
3937 IncHelper(config, &MacroAssembler::Incd, 1, kDRegSize, 42);
3938 IncHelper(config, &MacroAssembler::Incd, 2, kDRegSize, -1);
3939 IncHelper(config, &MacroAssembler::Incd, 15, kDRegSize, INT64_MAX);
3940 IncHelper(config, &MacroAssembler::Incd, 16, kDRegSize, -42);
3941}
3942
Martyn Capewell91d5ba32019-11-01 18:11:23 +00003943template <typename T>
3944static T QAdd(T x, int y) {
3945 VIXL_ASSERT(y > INT_MIN);
3946 T result;
3947 T min = std::numeric_limits<T>::min();
3948 T max = std::numeric_limits<T>::max();
3949 if ((x >= 0) && (y >= 0)) {
3950 // For positive x and y, saturate at max.
3951 result = (max - x) < static_cast<T>(y) ? max : x + y;
3952 } else if ((y < 0) && ((x < 0) || (min == 0))) {
3953 // For negative y, where either x is negative or T is unsigned, saturate at min.
3954 result = (x - min) < static_cast<T>(-y) ? min : x + y;
3955 } else {
3956 result = x + y;
3957 }
3958 return result;
3959}
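
// A few concrete values for QAdd, as a reader's sanity check on the saturation
// logic above. The 8-bit instantiations are purely illustrative (the tests use
// 32-bit and 64-bit accumulators) and this function is not called by them.
inline void QAddExamples() {
  VIXL_ASSERT(QAdd<uint8_t>(250, 10) == 255);    // Saturates at UINT8_MAX.
  VIXL_ASSERT(QAdd<uint8_t>(5, -10) == 0);       // Saturates at zero.
  VIXL_ASSERT(QAdd<int8_t>(120, 10) == 127);     // Saturates at INT8_MAX.
  VIXL_ASSERT(QAdd<int8_t>(-120, -10) == -128);  // Saturates at INT8_MIN.
  VIXL_ASSERT(QAdd<int8_t>(-5, 10) == 5);        // In range: a plain addition.
}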
3960
3961template <typename T>
3962static void QIncDecHelper(Test* config,
3963 CntFn cnt,
3964 int multiplier,
3965 int lane_size_in_bits,
3966 T acc_value,
3967 bool is_increment) {
3968 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
3969 START();
3970 GenerateCntSequence(&masm, cnt, acc_value, multiplier);
3971 END();
3972
3973 if (CAN_RUN()) {
3974 RUN();
3975
3976 int all = core.GetSVELaneCount(lane_size_in_bits);
3977 int pow2 = 1 << HighestSetBitPosition(all);
3978 int mul4 = all - (all % 4);
3979 int mul3 = all - (all % 3);
3980
3981 multiplier = is_increment ? multiplier : -multiplier;
3982
3983 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * pow2), x0);
3984 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(1, all)), x1);
3985 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(2, all)), x2);
3986 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(3, all)), x3);
3987 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(4, all)), x4);
3988 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(5, all)), x5);
3989 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(6, all)), x6);
3990 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(7, all)), x7);
3991 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(8, all)), x8);
3992 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(16, all)), x9);
3993 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(32, all)), x10);
3994 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(64, all)), x11);
3995 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(128, all)), x12);
3996 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(256, all)), x13);
3997 ASSERT_EQUAL_64(acc_value, x14);
3998 ASSERT_EQUAL_64(acc_value, x15);
3999 ASSERT_EQUAL_64(acc_value, x18);
4000 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * mul4), x19);
4001 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * mul3), x20);
4002 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * all), x21);
4003 }
4004}
4005
4006template <typename T>
4007static void QIncHelper(Test* config,
4008 CntFn cnt,
4009 int multiplier,
4010 int lane_size_in_bits,
4011 T acc_value) {
4012 QIncDecHelper<T>(config, cnt, multiplier, lane_size_in_bits, acc_value, true);
4013}
4014
4015template <typename T>
4016static void QDecHelper(Test* config,
4017 CntFn cnt,
4018 int multiplier,
4019 int lane_size_in_bits,
4020 T acc_value) {
4021 QIncDecHelper<T>(config,
4022 cnt,
4023 multiplier,
4024 lane_size_in_bits,
4025 acc_value,
4026 false);
4027}
4028
4029TEST_SVE(sve_sqdecb) {
4030 int64_t bigneg = INT64_MIN + 42;
4031 int64_t bigpos = INT64_MAX - 42;
4032 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecb, 1, kBRegSize, 1);
4033 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecb, 2, kBRegSize, bigneg);
4034 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecb, 15, kBRegSize, 999);
4035 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecb, 16, kBRegSize, bigpos);
4036}
4037
4038TEST_SVE(sve_sqdech) {
4039 int64_t bigneg = INT64_MIN + 42;
4040 int64_t bigpos = INT64_MAX - 42;
4041 QDecHelper<int64_t>(config, &MacroAssembler::Sqdech, 1, kHRegSize, 1);
4042 QDecHelper<int64_t>(config, &MacroAssembler::Sqdech, 2, kHRegSize, bigneg);
4043 QDecHelper<int64_t>(config, &MacroAssembler::Sqdech, 15, kHRegSize, 999);
4044 QDecHelper<int64_t>(config, &MacroAssembler::Sqdech, 16, kHRegSize, bigpos);
4045}
4046
4047TEST_SVE(sve_sqdecw) {
4048 int64_t bigneg = INT64_MIN + 42;
4049 int64_t bigpos = INT64_MAX - 42;
4050 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecw, 1, kWRegSize, 1);
4051 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecw, 2, kWRegSize, bigneg);
4052 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecw, 15, kWRegSize, 999);
4053 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecw, 16, kWRegSize, bigpos);
4054}
4055
4056TEST_SVE(sve_sqdecd) {
4057 int64_t bigneg = INT64_MIN + 42;
4058 int64_t bigpos = INT64_MAX - 42;
4059 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecd, 1, kDRegSize, 1);
4060 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecd, 2, kDRegSize, bigneg);
4061 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecd, 15, kDRegSize, 999);
4062 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecd, 16, kDRegSize, bigpos);
4063}
4064
4065TEST_SVE(sve_sqincb) {
4066 int64_t bigneg = INT64_MIN + 42;
4067 int64_t bigpos = INT64_MAX - 42;
4068 QIncHelper<int64_t>(config, &MacroAssembler::Sqincb, 1, kBRegSize, 1);
4069 QIncHelper<int64_t>(config, &MacroAssembler::Sqincb, 2, kBRegSize, bigneg);
4070 QIncHelper<int64_t>(config, &MacroAssembler::Sqincb, 15, kBRegSize, 999);
4071 QIncHelper<int64_t>(config, &MacroAssembler::Sqincb, 16, kBRegSize, bigpos);
4072}
4073
4074TEST_SVE(sve_sqinch) {
4075 int64_t bigneg = INT64_MIN + 42;
4076 int64_t bigpos = INT64_MAX - 42;
4077 QIncHelper<int64_t>(config, &MacroAssembler::Sqinch, 1, kHRegSize, 1);
4078 QIncHelper<int64_t>(config, &MacroAssembler::Sqinch, 2, kHRegSize, bigneg);
4079 QIncHelper<int64_t>(config, &MacroAssembler::Sqinch, 15, kHRegSize, 999);
4080 QIncHelper<int64_t>(config, &MacroAssembler::Sqinch, 16, kHRegSize, bigpos);
4081}
4082
4083TEST_SVE(sve_sqincw) {
4084 int64_t bigneg = INT64_MIN + 42;
4085 int64_t bigpos = INT64_MAX - 42;
4086 QIncHelper<int64_t>(config, &MacroAssembler::Sqincw, 1, kWRegSize, 1);
4087 QIncHelper<int64_t>(config, &MacroAssembler::Sqincw, 2, kWRegSize, bigneg);
4088 QIncHelper<int64_t>(config, &MacroAssembler::Sqincw, 15, kWRegSize, 999);
4089 QIncHelper<int64_t>(config, &MacroAssembler::Sqincw, 16, kWRegSize, bigpos);
4090}
4091
4092TEST_SVE(sve_sqincd) {
4093 int64_t bigneg = INT64_MIN + 42;
4094 int64_t bigpos = INT64_MAX - 42;
4095 QIncHelper<int64_t>(config, &MacroAssembler::Sqincd, 1, kDRegSize, 1);
4096 QIncHelper<int64_t>(config, &MacroAssembler::Sqincd, 2, kDRegSize, bigneg);
4097 QIncHelper<int64_t>(config, &MacroAssembler::Sqincd, 15, kDRegSize, 999);
4098 QIncHelper<int64_t>(config, &MacroAssembler::Sqincd, 16, kDRegSize, bigpos);
4099}
4100
4101TEST_SVE(sve_uqdecb) {
4102 uint32_t big32 = UINT32_MAX - 42;
4103 uint64_t big64 = UINT64_MAX - 42;
4104 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecb, 1, kBRegSize, 1);
4105 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecb, 2, kBRegSize, 42);
4106 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecb, 15, kBRegSize, 999);
4107 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecb, 16, kBRegSize, big32);
4108 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecb, 1, kBRegSize, 1);
4109 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecb, 2, kBRegSize, 42);
4110 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecb, 15, kBRegSize, 999);
4111 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecb, 16, kBRegSize, big64);
4112}
4113
4114TEST_SVE(sve_uqdech) {
4115 uint32_t big32 = UINT32_MAX - 42;
4116 uint64_t big64 = UINT64_MAX - 42;
4117 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdech, 1, kHRegSize, 1);
4118 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdech, 2, kHRegSize, 42);
4119 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdech, 15, kHRegSize, 999);
4120 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdech, 16, kHRegSize, big32);
4121 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdech, 1, kHRegSize, 1);
4122 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdech, 2, kHRegSize, 42);
4123 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdech, 15, kHRegSize, 999);
4124 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdech, 16, kHRegSize, big64);
4125}
4126
4127TEST_SVE(sve_uqdecw) {
4128 uint32_t big32 = UINT32_MAX - 42;
4129 uint64_t big64 = UINT64_MAX - 42;
4130 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecw, 1, kWRegSize, 1);
4131 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecw, 2, kWRegSize, 42);
4132 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecw, 15, kWRegSize, 999);
4133 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecw, 16, kWRegSize, big32);
4134 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecw, 1, kWRegSize, 1);
4135 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecw, 2, kWRegSize, 42);
4136 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecw, 15, kWRegSize, 999);
4137 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecw, 16, kWRegSize, big64);
4138}
4139
4140TEST_SVE(sve_uqdecd) {
4141 uint32_t big32 = UINT32_MAX - 42;
4142 uint64_t big64 = UINT64_MAX - 42;
4143 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecd, 1, kDRegSize, 1);
4144 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecd, 2, kDRegSize, 42);
4145 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecd, 15, kDRegSize, 999);
4146 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecd, 16, kDRegSize, big32);
4147 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecd, 1, kDRegSize, 1);
4148 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecd, 2, kDRegSize, 42);
4149 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecd, 15, kDRegSize, 999);
4150 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecd, 16, kDRegSize, big64);
4151}
4152
4153TEST_SVE(sve_uqincb) {
4154 uint32_t big32 = UINT32_MAX - 42;
4155 uint64_t big64 = UINT64_MAX - 42;
4156 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincb, 1, kBRegSize, 1);
4157 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincb, 2, kBRegSize, 42);
4158 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincb, 15, kBRegSize, 999);
4159 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincb, 16, kBRegSize, big32);
4160 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincb, 1, kBRegSize, 1);
4161 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincb, 2, kBRegSize, 42);
4162 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincb, 15, kBRegSize, 999);
4163 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincb, 16, kBRegSize, big64);
4164}
4165
4166TEST_SVE(sve_uqinch) {
4167 uint32_t big32 = UINT32_MAX - 42;
4168 uint64_t big64 = UINT64_MAX - 42;
4169 QIncHelper<uint32_t>(config, &MacroAssembler::Uqinch, 1, kHRegSize, 1);
4170 QIncHelper<uint32_t>(config, &MacroAssembler::Uqinch, 2, kHRegSize, 42);
4171 QIncHelper<uint32_t>(config, &MacroAssembler::Uqinch, 15, kHRegSize, 999);
4172 QIncHelper<uint32_t>(config, &MacroAssembler::Uqinch, 16, kHRegSize, big32);
4173 QIncHelper<uint64_t>(config, &MacroAssembler::Uqinch, 1, kHRegSize, 1);
4174 QIncHelper<uint64_t>(config, &MacroAssembler::Uqinch, 2, kHRegSize, 42);
4175 QIncHelper<uint64_t>(config, &MacroAssembler::Uqinch, 15, kHRegSize, 999);
4176 QIncHelper<uint64_t>(config, &MacroAssembler::Uqinch, 16, kHRegSize, big64);
4177}
4178
4179TEST_SVE(sve_uqincw) {
4180 uint32_t big32 = UINT32_MAX - 42;
4181 uint64_t big64 = UINT64_MAX - 42;
4182 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincw, 1, kWRegSize, 1);
4183 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincw, 2, kWRegSize, 42);
4184 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincw, 15, kWRegSize, 999);
4185 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincw, 16, kWRegSize, big32);
4186 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincw, 1, kWRegSize, 1);
4187 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincw, 2, kWRegSize, 42);
4188 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincw, 15, kWRegSize, 999);
4189 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincw, 16, kWRegSize, big64);
4190}
4191
4192TEST_SVE(sve_uqincd) {
4193 uint32_t big32 = UINT32_MAX - 42;
4194 uint64_t big64 = UINT64_MAX - 42;
4195 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincd, 1, kDRegSize, 1);
4196 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincd, 2, kDRegSize, 42);
4197 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincd, 15, kDRegSize, 999);
4198 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincd, 16, kDRegSize, big32);
4199 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincd, 1, kDRegSize, 1);
4200 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincd, 2, kDRegSize, 42);
4201 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincd, 15, kDRegSize, 999);
4202 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincd, 16, kDRegSize, big64);
4203}
4204
4205typedef void (MacroAssembler::*QIncDecXWFn)(const Register& dst,
4206 const Register& src,
4207 int pattern,
4208 int multiplier);
4209
4210static void QIncDecXWHelper(Test* config,
4211 QIncDecXWFn cnt,
4212 int multiplier,
4213 int lane_size_in_bits,
4214 int32_t acc_value,
4215 bool is_increment) {
4216 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
4217 START();
4218
4219 // Initialise accumulators.
4220 __ Mov(x0, acc_value);
4221 __ Mov(x1, acc_value);
4222 __ Mov(x2, acc_value);
4223 __ Mov(x3, acc_value);
4224 __ Mov(x4, acc_value);
4225 __ Mov(x5, acc_value);
4226 __ Mov(x6, acc_value);
4227 __ Mov(x7, acc_value);
4228 __ Mov(x8, acc_value);
4229 __ Mov(x9, acc_value);
4230 __ Mov(x10, acc_value);
4231 __ Mov(x11, acc_value);
4232 __ Mov(x12, acc_value);
4233 __ Mov(x13, acc_value);
4234 __ Mov(x14, acc_value);
4235 __ Mov(x15, acc_value);
4236 __ Mov(x18, acc_value);
4237 __ Mov(x19, acc_value);
4238 __ Mov(x20, acc_value);
4239 __ Mov(x21, acc_value);
4240
4241 (masm.*cnt)(x0, w0, SVE_POW2, multiplier);
4242 (masm.*cnt)(x1, w1, SVE_VL1, multiplier);
4243 (masm.*cnt)(x2, w2, SVE_VL2, multiplier);
4244 (masm.*cnt)(x3, w3, SVE_VL3, multiplier);
4245 (masm.*cnt)(x4, w4, SVE_VL4, multiplier);
4246 (masm.*cnt)(x5, w5, SVE_VL5, multiplier);
4247 (masm.*cnt)(x6, w6, SVE_VL6, multiplier);
4248 (masm.*cnt)(x7, w7, SVE_VL7, multiplier);
4249 (masm.*cnt)(x8, w8, SVE_VL8, multiplier);
4250 (masm.*cnt)(x9, w9, SVE_VL16, multiplier);
4251 (masm.*cnt)(x10, w10, SVE_VL32, multiplier);
4252 (masm.*cnt)(x11, w11, SVE_VL64, multiplier);
4253 (masm.*cnt)(x12, w12, SVE_VL128, multiplier);
4254 (masm.*cnt)(x13, w13, SVE_VL256, multiplier);
4255 (masm.*cnt)(x14, w14, 16, multiplier);
4256 (masm.*cnt)(x15, w15, 23, multiplier);
4257 (masm.*cnt)(x18, w18, 28, multiplier);
4258 (masm.*cnt)(x19, w19, SVE_MUL4, multiplier);
4259 (masm.*cnt)(x20, w20, SVE_MUL3, multiplier);
4260 (masm.*cnt)(x21, w21, SVE_ALL, multiplier);
4261
4262 END();
4263
4264 if (CAN_RUN()) {
4265 RUN();
4266
4267 int all = core.GetSVELaneCount(lane_size_in_bits);
4268 int pow2 = 1 << HighestSetBitPosition(all);
4269 int mul4 = all - (all % 4);
4270 int mul3 = all - (all % 3);
4271
4272 multiplier = is_increment ? multiplier : -multiplier;
4273
4274 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * pow2), x0);
4275 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(1, all)), x1);
4276 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(2, all)), x2);
4277 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(3, all)), x3);
4278 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(4, all)), x4);
4279 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(5, all)), x5);
4280 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(6, all)), x6);
4281 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(7, all)), x7);
4282 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(8, all)), x8);
4283 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(16, all)), x9);
4284 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(32, all)), x10);
4285 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(64, all)), x11);
4286 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(128, all)), x12);
4287 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(256, all)), x13);
4288 ASSERT_EQUAL_64(acc_value, x14);
4289 ASSERT_EQUAL_64(acc_value, x15);
4290 ASSERT_EQUAL_64(acc_value, x18);
4291 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * mul4), x19);
4292 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * mul3), x20);
4293 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * all), x21);
4294 }
4295}
4296
4297static void QIncXWHelper(Test* config,
4298 QIncDecXWFn cnt,
4299 int multiplier,
4300 int lane_size_in_bits,
4301 int32_t acc_value) {
4302 QIncDecXWHelper(config, cnt, multiplier, lane_size_in_bits, acc_value, true);
4303}
4304
4305static void QDecXWHelper(Test* config,
4306 QIncDecXWFn cnt,
4307 int multiplier,
4308 int lane_size_in_bits,
4309 int32_t acc_value) {
4310 QIncDecXWHelper(config, cnt, multiplier, lane_size_in_bits, acc_value, false);
4311}
4312
4313TEST_SVE(sve_sqdecb_xw) {
4314 QDecXWHelper(config, &MacroAssembler::Sqdecb, 1, kBRegSize, 1);
4315 QDecXWHelper(config, &MacroAssembler::Sqdecb, 2, kBRegSize, INT32_MIN + 42);
4316 QDecXWHelper(config, &MacroAssembler::Sqdecb, 15, kBRegSize, 999);
4317 QDecXWHelper(config, &MacroAssembler::Sqdecb, 16, kBRegSize, INT32_MAX - 42);
4318}
4319
4320TEST_SVE(sve_sqdech_xw) {
4321 QDecXWHelper(config, &MacroAssembler::Sqdech, 1, kHRegSize, 1);
4322 QDecXWHelper(config, &MacroAssembler::Sqdech, 2, kHRegSize, INT32_MIN + 42);
4323 QDecXWHelper(config, &MacroAssembler::Sqdech, 15, kHRegSize, 999);
4324 QDecXWHelper(config, &MacroAssembler::Sqdech, 16, kHRegSize, INT32_MAX - 42);
4325}
4326
4327TEST_SVE(sve_sqdecw_xw) {
4328 QDecXWHelper(config, &MacroAssembler::Sqdecw, 1, kWRegSize, 1);
4329 QDecXWHelper(config, &MacroAssembler::Sqdecw, 2, kWRegSize, INT32_MIN + 42);
4330 QDecXWHelper(config, &MacroAssembler::Sqdecw, 15, kWRegSize, 999);
4331 QDecXWHelper(config, &MacroAssembler::Sqdecw, 16, kWRegSize, INT32_MAX - 42);
4332}
4333
4334TEST_SVE(sve_sqdecd_xw) {
4335 QDecXWHelper(config, &MacroAssembler::Sqdecd, 1, kDRegSize, 1);
4336 QDecXWHelper(config, &MacroAssembler::Sqdecd, 2, kDRegSize, INT32_MIN + 42);
4337 QDecXWHelper(config, &MacroAssembler::Sqdecd, 15, kDRegSize, 999);
4338 QDecXWHelper(config, &MacroAssembler::Sqdecd, 16, kDRegSize, INT32_MAX - 42);
4339}
4340
4341TEST_SVE(sve_sqincb_xw) {
4342 QIncXWHelper(config, &MacroAssembler::Sqincb, 1, kBRegSize, 1);
4343 QIncXWHelper(config, &MacroAssembler::Sqincb, 2, kBRegSize, INT32_MIN + 42);
4344 QIncXWHelper(config, &MacroAssembler::Sqincb, 15, kBRegSize, 999);
4345 QIncXWHelper(config, &MacroAssembler::Sqincb, 16, kBRegSize, INT32_MAX - 42);
4346}
4347
4348TEST_SVE(sve_sqinch_xw) {
4349 QIncXWHelper(config, &MacroAssembler::Sqinch, 1, kHRegSize, 1);
4350 QIncXWHelper(config, &MacroAssembler::Sqinch, 2, kHRegSize, INT32_MIN + 42);
4351 QIncXWHelper(config, &MacroAssembler::Sqinch, 15, kHRegSize, 999);
4352 QIncXWHelper(config, &MacroAssembler::Sqinch, 16, kHRegSize, INT32_MAX - 42);
4353}
4354
4355TEST_SVE(sve_sqincw_xw) {
4356 QIncXWHelper(config, &MacroAssembler::Sqincw, 1, kWRegSize, 1);
4357 QIncXWHelper(config, &MacroAssembler::Sqincw, 2, kWRegSize, INT32_MIN + 42);
4358 QIncXWHelper(config, &MacroAssembler::Sqincw, 15, kWRegSize, 999);
4359 QIncXWHelper(config, &MacroAssembler::Sqincw, 16, kWRegSize, INT32_MAX - 42);
4360}
4361
4362TEST_SVE(sve_sqincd_xw) {
4363 QIncXWHelper(config, &MacroAssembler::Sqincd, 1, kDRegSize, 1);
4364 QIncXWHelper(config, &MacroAssembler::Sqincd, 2, kDRegSize, INT32_MIN + 42);
4365 QIncXWHelper(config, &MacroAssembler::Sqincd, 15, kDRegSize, 999);
4366 QIncXWHelper(config, &MacroAssembler::Sqincd, 16, kDRegSize, INT32_MAX - 42);
4367}
4368
Martyn Capewell8188ddf2019-11-21 17:09:34 +00004369typedef void (MacroAssembler::*IncDecZFn)(const ZRegister& dst,
4370 int pattern,
4371 int multiplier);
4372typedef void (MacroAssembler::*AddSubFn)(const ZRegister& dst,
4373 const ZRegister& src1,
4374 const ZRegister& src2);
4375
4376static void IncDecZHelper(Test* config,
4377 IncDecZFn fn,
4378 CntFn cnt,
4379 AddSubFn addsub,
4380 int multiplier,
4381 int lane_size_in_bits) {
4382 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
4383 START();
4384
4385 uint64_t acc_inputs[] = {0x7766554433221100,
4386 0xffffffffffffffff,
4387 0x0000000000000000,
4388 0xffffffff0000ffff,
4389 0x7fffffffffffffff,
4390 0x8000000000000000,
4391 0x7fffffff7fff7fff,
4392 0x8000000080008000};
4393
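  // Fill every Z register by inserting the eight 64-bit inputs four times, so
  // that even the maximum 2048-bit vector length is fully initialised.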
4394 for (unsigned i = 0; i < kNumberOfZRegisters; i++) {
4395 for (int j = 0; j < 4; j++) {
4396 InsrHelper(&masm, ZRegister(i, kDRegSize), acc_inputs);
4397 }
4398 }
4399 for (unsigned i = 0; i < 15; i++) {
4400 __ Mov(XRegister(i), 0);
4401 }
4402
4403 (masm.*fn)(z16.WithLaneSize(lane_size_in_bits), SVE_POW2, multiplier);
4404 (masm.*fn)(z17.WithLaneSize(lane_size_in_bits), SVE_VL1, multiplier);
4405 (masm.*fn)(z18.WithLaneSize(lane_size_in_bits), SVE_VL2, multiplier);
4406 (masm.*fn)(z19.WithLaneSize(lane_size_in_bits), SVE_VL3, multiplier);
4407 (masm.*fn)(z20.WithLaneSize(lane_size_in_bits), SVE_VL4, multiplier);
4408 (masm.*fn)(z21.WithLaneSize(lane_size_in_bits), SVE_VL7, multiplier);
4409 (masm.*fn)(z22.WithLaneSize(lane_size_in_bits), SVE_VL8, multiplier);
4410 (masm.*fn)(z23.WithLaneSize(lane_size_in_bits), SVE_VL16, multiplier);
4411 (masm.*fn)(z24.WithLaneSize(lane_size_in_bits), SVE_VL64, multiplier);
4412 (masm.*fn)(z25.WithLaneSize(lane_size_in_bits), SVE_VL256, multiplier);
4413 (masm.*fn)(z26.WithLaneSize(lane_size_in_bits), 16, multiplier);
4414 (masm.*fn)(z27.WithLaneSize(lane_size_in_bits), 28, multiplier);
4415 (masm.*fn)(z28.WithLaneSize(lane_size_in_bits), SVE_MUL3, multiplier);
4416 (masm.*fn)(z29.WithLaneSize(lane_size_in_bits), SVE_MUL4, multiplier);
4417 (masm.*fn)(z30.WithLaneSize(lane_size_in_bits), SVE_ALL, multiplier);
4418
4419 // Perform computation using alternative instructions.
4420 (masm.*cnt)(x0, SVE_POW2, multiplier);
4421 (masm.*cnt)(x1, SVE_VL1, multiplier);
4422 (masm.*cnt)(x2, SVE_VL2, multiplier);
4423 (masm.*cnt)(x3, SVE_VL3, multiplier);
4424 (masm.*cnt)(x4, SVE_VL4, multiplier);
4425 (masm.*cnt)(x5, SVE_VL7, multiplier);
4426 (masm.*cnt)(x6, SVE_VL8, multiplier);
4427 (masm.*cnt)(x7, SVE_VL16, multiplier);
4428 (masm.*cnt)(x8, SVE_VL64, multiplier);
4429 (masm.*cnt)(x9, SVE_VL256, multiplier);
4430 (masm.*cnt)(x10, 16, multiplier);
4431 (masm.*cnt)(x11, 28, multiplier);
4432 (masm.*cnt)(x12, SVE_MUL3, multiplier);
4433 (masm.*cnt)(x13, SVE_MUL4, multiplier);
4434 (masm.*cnt)(x14, SVE_ALL, multiplier);
4435
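  // Apply each scalar count to the corresponding accumulator register, so that
  // z0-z14 hold reference results to compare against z16-z30 below.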
4436 ZRegister zscratch = z15.WithLaneSize(lane_size_in_bits);
4437 for (unsigned i = 0; i < 15; i++) {
4438 ZRegister zsrcdst = ZRegister(i, lane_size_in_bits);
4439 Register x = Register(i, kXRegSize);
4440 __ Dup(zscratch, x);
4441 (masm.*addsub)(zsrcdst, zsrcdst, zscratch);
4442 }
4443
4444 END();
4445
4446 if (CAN_RUN()) {
4447 RUN();
4448
4449 ASSERT_EQUAL_SVE(z0, z16);
4450 ASSERT_EQUAL_SVE(z1, z17);
4451 ASSERT_EQUAL_SVE(z2, z18);
4452 ASSERT_EQUAL_SVE(z3, z19);
4453 ASSERT_EQUAL_SVE(z4, z20);
4454 ASSERT_EQUAL_SVE(z5, z21);
4455 ASSERT_EQUAL_SVE(z6, z22);
4456 ASSERT_EQUAL_SVE(z7, z23);
4457 ASSERT_EQUAL_SVE(z8, z24);
4458 ASSERT_EQUAL_SVE(z9, z25);
4459 ASSERT_EQUAL_SVE(z10, z26);
4460 ASSERT_EQUAL_SVE(z11, z27);
4461 ASSERT_EQUAL_SVE(z12, z28);
4462 ASSERT_EQUAL_SVE(z13, z29);
4463 ASSERT_EQUAL_SVE(z14, z30);
4464 }
4465}
4466
4467TEST_SVE(sve_inc_dec_vec) {
4468 CntFn cnth = &MacroAssembler::Cnth;
4469 CntFn cntw = &MacroAssembler::Cntw;
4470 CntFn cntd = &MacroAssembler::Cntd;
4471 AddSubFn sub = &MacroAssembler::Sub;
4472 AddSubFn add = &MacroAssembler::Add;
4473 for (int mult = 1; mult <= 16; mult += 5) {
4474 IncDecZHelper(config, &MacroAssembler::Dech, cnth, sub, mult, kHRegSize);
4475 IncDecZHelper(config, &MacroAssembler::Decw, cntw, sub, mult, kSRegSize);
4476 IncDecZHelper(config, &MacroAssembler::Decd, cntd, sub, mult, kDRegSize);
4477 IncDecZHelper(config, &MacroAssembler::Inch, cnth, add, mult, kHRegSize);
4478 IncDecZHelper(config, &MacroAssembler::Incw, cntw, add, mult, kSRegSize);
4479 IncDecZHelper(config, &MacroAssembler::Incd, cntd, add, mult, kDRegSize);
4480 }
4481}
4482
4483TEST_SVE(sve_unsigned_sat_inc_dec_vec) {
4484 CntFn cnth = &MacroAssembler::Cnth;
4485 CntFn cntw = &MacroAssembler::Cntw;
4486 CntFn cntd = &MacroAssembler::Cntd;
4487 AddSubFn sub = &MacroAssembler::Uqsub;
4488 AddSubFn add = &MacroAssembler::Uqadd;
4489 for (int mult = 1; mult <= 16; mult += 5) {
4490 IncDecZHelper(config, &MacroAssembler::Uqdech, cnth, sub, mult, kHRegSize);
4491 IncDecZHelper(config, &MacroAssembler::Uqdecw, cntw, sub, mult, kSRegSize);
4492 IncDecZHelper(config, &MacroAssembler::Uqdecd, cntd, sub, mult, kDRegSize);
4493 IncDecZHelper(config, &MacroAssembler::Uqinch, cnth, add, mult, kHRegSize);
4494 IncDecZHelper(config, &MacroAssembler::Uqincw, cntw, add, mult, kSRegSize);
4495 IncDecZHelper(config, &MacroAssembler::Uqincd, cntd, add, mult, kDRegSize);
4496 }
4497}
4498
4499TEST_SVE(sve_signed_sat_inc_dec_vec) {
4500 CntFn cnth = &MacroAssembler::Cnth;
4501 CntFn cntw = &MacroAssembler::Cntw;
4502 CntFn cntd = &MacroAssembler::Cntd;
4503 AddSubFn sub = &MacroAssembler::Sqsub;
4504 AddSubFn add = &MacroAssembler::Sqadd;
4505 for (int mult = 1; mult <= 16; mult += 5) {
4506 IncDecZHelper(config, &MacroAssembler::Sqdech, cnth, sub, mult, kHRegSize);
4507 IncDecZHelper(config, &MacroAssembler::Sqdecw, cntw, sub, mult, kSRegSize);
4508 IncDecZHelper(config, &MacroAssembler::Sqdecd, cntd, sub, mult, kDRegSize);
4509 IncDecZHelper(config, &MacroAssembler::Sqinch, cnth, add, mult, kHRegSize);
4510 IncDecZHelper(config, &MacroAssembler::Sqincw, cntw, add, mult, kSRegSize);
4511 IncDecZHelper(config, &MacroAssembler::Sqincd, cntd, add, mult, kDRegSize);
4512 }
4513}
4514
TatWai Chong7a0d3672019-10-23 17:35:18 -07004515typedef void (MacroAssembler::*ArithPredicatedFn)(const ZRegister& zd,
4516 const PRegisterM& pg,
4517 const ZRegister& zn,
4518 const ZRegister& zm);
TatWai Chong13634762019-07-16 16:20:45 -07004519
4520template <typename Td, typename Tg, typename Tn>
4521static void IntBinArithHelper(Test* config,
TatWai Chong7a0d3672019-10-23 17:35:18 -07004522 ArithPredicatedFn macro,
TatWai Chong13634762019-07-16 16:20:45 -07004523 unsigned lane_size_in_bits,
4524 const Tg& pg_inputs,
4525 const Tn& zn_inputs,
4526 const Tn& zm_inputs,
4527 const Td& zd_expected) {
4528 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
4529 START();
4530
4531 ZRegister src_a = z31.WithLaneSize(lane_size_in_bits);
4532 ZRegister src_b = z27.WithLaneSize(lane_size_in_bits);
4533 InsrHelper(&masm, src_a, zn_inputs);
4534 InsrHelper(&masm, src_b, zm_inputs);
4535
4536 Initialise(&masm, p0.WithLaneSize(lane_size_in_bits), pg_inputs);
4537
4538 ZRegister zd_1 = z0.WithLaneSize(lane_size_in_bits);
4539 ZRegister zd_2 = z1.WithLaneSize(lane_size_in_bits);
4540 ZRegister zd_3 = z2.WithLaneSize(lane_size_in_bits);
4541
4542 // `instr` zd(dst), zd(src_a), zn(src_b)
4543 __ Mov(zd_1, src_a);
4544 (masm.*macro)(zd_1, p0.Merging(), zd_1, src_b);
4545
4546 // `instr` zd(dst), zm(src_a), zd(src_b)
4547 // Depending on whether the zd and zm registers are aliased, the instruction
4548 // macro (`Instr`) swaps the operand order if the operation is commutative;
4549 // otherwise it uses the reversed form of `Instr`, such as subr or divr.
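  // For example (illustrative only), `Sub(zd, pg.Merging(), zn, zd)` with an
  // aliased zd might be emitted as `subr zd, pg/m, zd, zn`.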
4550 __ Mov(zd_2, src_b);
4551 (masm.*macro)(zd_2, p0.Merging(), src_a, zd_2);
4552
4553 // `instr` zd(dst), zm(src_a), zn(src_b)
4554 // The instruction macro (`Instr`) automatically selects between `instr`
4555 // and movprfx + `instr`, based on whether the zd and zn registers are aliased.
TatWai Chongd316c5e2019-10-16 12:22:10 -07004556 // A generated movprfx instruction is predicated, using the same
TatWai Chong13634762019-07-16 16:20:45 -07004557 // governing predicate register. In order to keep the result constant,
4558 // initialize the destination register first.
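  // For example (illustrative only), `Add(zd, pg.Merging(), zn, zm)` with a
  // distinct zd might expand to:
  //   movprfx zd.b, pg/m, zn.b
  //   add zd.b, pg/m, zd.b, zm.b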
4559 __ Mov(zd_3, src_a);
4560 (masm.*macro)(zd_3, p0.Merging(), src_a, src_b);
4561
4562 END();
4563
4564 if (CAN_RUN()) {
4565 RUN();
4566 ASSERT_EQUAL_SVE(zd_expected, zd_1);
4567
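    // Check zd_1 lane by lane: active lanes hold the expected result, while
    // inactive (merging) lanes keep their original src_a value.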
4568 for (size_t i = 0; i < ArrayLength(zd_expected); i++) {
4569 int lane = static_cast<int>(ArrayLength(zd_expected) - i - 1);
4570 if (!core.HasSVELane(zd_1, lane)) break;
TatWai Chongd316c5e2019-10-16 12:22:10 -07004571 if ((pg_inputs[i] & 1) != 0) {
TatWai Chong13634762019-07-16 16:20:45 -07004572 ASSERT_EQUAL_SVE_LANE(zd_expected[i], zd_1, lane);
4573 } else {
4574 ASSERT_EQUAL_SVE_LANE(zn_inputs[i], zd_1, lane);
4575 }
4576 }
4577
4578 ASSERT_EQUAL_SVE(zd_expected, zd_3);
4579 }
4580}
4581
4582TEST_SVE(sve_binary_arithmetic_predicated_add) {
4583 // clang-format off
4584 unsigned zn_b[] = {0x00, 0x01, 0x10, 0x81, 0xff, 0x0f, 0x01, 0x7f};
4585
4586 unsigned zm_b[] = {0x00, 0x01, 0x10, 0x00, 0x81, 0x80, 0xff, 0xff};
4587
4588 unsigned zn_h[] = {0x0000, 0x0123, 0x1010, 0x8181, 0xffff, 0x0f0f, 0x0101, 0x7f7f};
4589
4590 unsigned zm_h[] = {0x0000, 0x0123, 0x1010, 0x0000, 0x8181, 0x8080, 0xffff, 0xffff};
4591
4592 unsigned zn_s[] = {0x00000000, 0x01234567, 0x10101010, 0x81818181,
4593 0xffffffff, 0x0f0f0f0f, 0x01010101, 0x7f7f7f7f};
4594
4595 unsigned zm_s[] = {0x00000000, 0x01234567, 0x10101010, 0x00000000,
4596 0x81818181, 0x80808080, 0xffffffff, 0xffffffff};
4597
4598 uint64_t zn_d[] = {0x0000000000000000, 0x0123456789abcdef,
4599 0x1010101010101010, 0x8181818181818181,
4600 0xffffffffffffffff, 0x0f0f0f0f0f0f0f0f,
4601 0x0101010101010101, 0x7f7f7f7fffffffff};
4602
4603 uint64_t zm_d[] = {0x0000000000000000, 0x0123456789abcdef,
4604 0x1010101010101010, 0x0000000000000000,
4605 0x8181818181818181, 0x8080808080808080,
4606 0xffffffffffffffff, 0xffffffffffffffff};
4607
4608 int pg_b[] = {1, 1, 1, 0, 1, 1, 1, 0};
4609 int pg_h[] = {1, 1, 0, 1, 1, 1, 0, 1};
4610 int pg_s[] = {1, 0, 1, 1, 1, 0, 1, 1};
4611 int pg_d[] = {0, 1, 1, 1, 0, 1, 1, 1};
4612
4613 unsigned add_exp_b[] = {0x00, 0x02, 0x20, 0x81, 0x80, 0x8f, 0x00, 0x7f};
4614
4615 unsigned add_exp_h[] = {0x0000, 0x0246, 0x1010, 0x8181,
4616 0x8180, 0x8f8f, 0x0101, 0x7f7e};
4617
4618 unsigned add_exp_s[] = {0x00000000, 0x01234567, 0x20202020, 0x81818181,
4619 0x81818180, 0x0f0f0f0f, 0x01010100, 0x7f7f7f7e};
4620
4621 uint64_t add_exp_d[] = {0x0000000000000000, 0x02468acf13579bde,
4622 0x2020202020202020, 0x8181818181818181,
4623 0xffffffffffffffff, 0x8f8f8f8f8f8f8f8f,
4624 0x0101010101010100, 0x7f7f7f7ffffffffe};
4625
TatWai Chong7a0d3672019-10-23 17:35:18 -07004626 ArithPredicatedFn fn = &MacroAssembler::Add;
TatWai Chong13634762019-07-16 16:20:45 -07004627 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, add_exp_b);
4628 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, add_exp_h);
4629 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, add_exp_s);
4630 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, add_exp_d);
4631
4632 unsigned sub_exp_b[] = {0x00, 0x00, 0x00, 0x81, 0x7e, 0x8f, 0x02, 0x7f};
4633
4634 unsigned sub_exp_h[] = {0x0000, 0x0000, 0x1010, 0x8181,
4635 0x7e7e, 0x8e8f, 0x0101, 0x7f80};
4636
4637 unsigned sub_exp_s[] = {0x00000000, 0x01234567, 0x00000000, 0x81818181,
4638 0x7e7e7e7e, 0x0f0f0f0f, 0x01010102, 0x7f7f7f80};
4639
4640 uint64_t sub_exp_d[] = {0x0000000000000000, 0x0000000000000000,
4641 0x0000000000000000, 0x8181818181818181,
4642 0xffffffffffffffff, 0x8e8e8e8e8e8e8e8f,
4643 0x0101010101010102, 0x7f7f7f8000000000};
4644
4645 fn = &MacroAssembler::Sub;
4646 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, sub_exp_b);
4647 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, sub_exp_h);
4648 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, sub_exp_s);
4649 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, sub_exp_d);
4650 // clang-format on
4651}
4652
4653TEST_SVE(sve_binary_arithmetic_predicated_umin_umax_uabd) {
4654 // clang-format off
4655 unsigned zn_b[] = {0x00, 0xff, 0x0f, 0xff, 0xf0, 0x98, 0x55, 0x67};
4656
4657 unsigned zm_b[] = {0x01, 0x00, 0x0e, 0xfe, 0xfe, 0xab, 0xcd, 0x78};
4658
4659 unsigned zn_h[] = {0x0000, 0xffff, 0x00ff, 0xffff,
4660 0xff00, 0xba98, 0x5555, 0x4567};
4661
4662 unsigned zm_h[] = {0x0001, 0x0000, 0x00ee, 0xfffe,
4663 0xfe00, 0xabab, 0xcdcd, 0x5678};
4664
4665 unsigned zn_s[] = {0x00000000, 0xffffffff, 0x0000ffff, 0xffffffff,
4666 0xffff0000, 0xfedcba98, 0x55555555, 0x01234567};
4667
4668 unsigned zm_s[] = {0x00000001, 0x00000000, 0x0000eeee, 0xfffffffe,
4669 0xfffe0000, 0xabababab, 0xcdcdcdcd, 0x12345678};
4670
4671 uint64_t zn_d[] = {0x0000000000000000, 0xffffffffffffffff,
4672 0x5555555555555555, 0x0000000001234567};
4673
4674 uint64_t zm_d[] = {0x0000000000000001, 0x0000000000000000,
4675 0xcdcdcdcdcdcdcdcd, 0x0000000012345678};
4676
4677 int pg_b[] = {1, 1, 1, 0, 1, 1, 1, 0};
4678 int pg_h[] = {1, 1, 0, 1, 1, 1, 0, 1};
4679 int pg_s[] = {1, 0, 1, 1, 1, 0, 1, 1};
4680 int pg_d[] = {1, 0, 1, 1};
4681
4682 unsigned umax_exp_b[] = {0x01, 0xff, 0x0f, 0xff, 0xfe, 0xab, 0xcd, 0x67};
4683
4684 unsigned umax_exp_h[] = {0x0001, 0xffff, 0x00ff, 0xffff,
4685 0xff00, 0xba98, 0x5555, 0x5678};
4686
4687 unsigned umax_exp_s[] = {0x00000001, 0xffffffff, 0x0000ffff, 0xffffffff,
4688 0xffff0000, 0xfedcba98, 0xcdcdcdcd, 0x12345678};
4689
4690 uint64_t umax_exp_d[] = {0x0000000000000001, 0xffffffffffffffff,
4691 0xcdcdcdcdcdcdcdcd, 0x0000000012345678};
4692
TatWai Chong7a0d3672019-10-23 17:35:18 -07004693 ArithPredicatedFn fn = &MacroAssembler::Umax;
TatWai Chong13634762019-07-16 16:20:45 -07004694 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, umax_exp_b);
4695 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, umax_exp_h);
4696 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, umax_exp_s);
4697 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, umax_exp_d);
4698
4699 unsigned umin_exp_b[] = {0x00, 0x00, 0x0e, 0xff, 0xf0, 0x98, 0x55, 0x67};
4700
4701 unsigned umin_exp_h[] = {0x0000, 0x0000, 0x00ff, 0xfffe,
4702 0xfe00, 0xabab, 0x5555, 0x4567};
4703
4704 unsigned umin_exp_s[] = {0x00000000, 0xffffffff, 0x0000eeee, 0xfffffffe,
4705 0xfffe0000, 0xfedcba98, 0x55555555, 0x01234567};
4706
4707 uint64_t umin_exp_d[] = {0x0000000000000000, 0xffffffffffffffff,
4708 0x5555555555555555, 0x0000000001234567};
4709 fn = &MacroAssembler::Umin;
4710 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, umin_exp_b);
4711 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, umin_exp_h);
4712 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, umin_exp_s);
4713 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, umin_exp_d);
4714
4715 unsigned uabd_exp_b[] = {0x01, 0xff, 0x01, 0xff, 0x0e, 0x13, 0x78, 0x67};
4716
4717 unsigned uabd_exp_h[] = {0x0001, 0xffff, 0x00ff, 0x0001,
4718 0x0100, 0x0eed, 0x5555, 0x1111};
4719
4720 unsigned uabd_exp_s[] = {0x00000001, 0xffffffff, 0x00001111, 0x00000001,
4721 0x00010000, 0xfedcba98, 0x78787878, 0x11111111};
4722
4723 uint64_t uabd_exp_d[] = {0x0000000000000001, 0xffffffffffffffff,
4724 0x7878787878787878, 0x0000000011111111};
4725
4726 fn = &MacroAssembler::Uabd;
4727 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, uabd_exp_b);
4728 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, uabd_exp_h);
4729 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, uabd_exp_s);
4730 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, uabd_exp_d);
4731 // clang-format on
4732}
4733
4734TEST_SVE(sve_binary_arithmetic_predicated_smin_smax_sabd) {
4735 // clang-format off
4736 int zn_b[] = {0, -128, -128, -128, -128, 127, 127, 1};
4737
4738 int zm_b[] = {-1, 0, -1, -127, 127, 126, -1, 0};
4739
4740 int zn_h[] = {0, INT16_MIN, INT16_MIN, INT16_MIN,
4741 INT16_MIN, INT16_MAX, INT16_MAX, 1};
4742
4743 int zm_h[] = {-1, 0, -1, INT16_MIN + 1,
4744 INT16_MAX, INT16_MAX - 1, -1, 0};
4745
4746 int zn_s[] = {0, INT32_MIN, INT32_MIN, INT32_MIN,
4747 INT32_MIN, INT32_MAX, INT32_MAX, 1};
4748
4749 int zm_s[] = {-1, 0, -1, -INT32_MAX,
4750 INT32_MAX, INT32_MAX - 1, -1, 0};
4751
4752 int64_t zn_d[] = {0, INT64_MIN, INT64_MIN, INT64_MIN,
4753 INT64_MIN, INT64_MAX, INT64_MAX, 1};
4754
4755 int64_t zm_d[] = {-1, 0, -1, INT64_MIN + 1,
4756 INT64_MAX, INT64_MAX - 1, -1, 0};
4757
4758 int pg_b[] = {1, 1, 1, 0, 1, 1, 1, 0};
4759 int pg_h[] = {1, 1, 0, 1, 1, 1, 0, 1};
4760 int pg_s[] = {1, 0, 1, 1, 1, 0, 1, 1};
4761 int pg_d[] = {0, 1, 1, 1, 0, 1, 1, 1};
4762
4763 int smax_exp_b[] = {0, 0, -1, -128, 127, 127, 127, 1};
4764
4765 int smax_exp_h[] = {0, 0, INT16_MIN, INT16_MIN + 1,
4766 INT16_MAX, INT16_MAX, INT16_MAX, 1};
4767
4768 int smax_exp_s[] = {0, INT32_MIN, -1, INT32_MIN + 1,
4769 INT32_MAX, INT32_MAX, INT32_MAX, 1};
4770
4771 int64_t smax_exp_d[] = {0, 0, -1, INT64_MIN + 1,
4772 INT64_MIN, INT64_MAX, INT64_MAX, 1};
4773
TatWai Chong7a0d3672019-10-23 17:35:18 -07004774 ArithPredicatedFn fn = &MacroAssembler::Smax;
TatWai Chong13634762019-07-16 16:20:45 -07004775 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, smax_exp_b);
4776 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, smax_exp_h);
4777 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, smax_exp_s);
4778 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, smax_exp_d);
4779
4780 int smin_exp_b[] = {-1, -128, -128, -128, -128, 126, -1, 1};
4781
4782 int smin_exp_h[] = {-1, INT16_MIN, INT16_MIN, INT16_MIN,
4783 INT16_MIN, INT16_MAX - 1, INT16_MAX, 0};
4784
4785 int smin_exp_s[] = {-1, INT32_MIN, INT32_MIN, INT32_MIN,
4786 INT32_MIN, INT32_MAX, -1, 0};
4787
4788 int64_t smin_exp_d[] = {0, INT64_MIN, INT64_MIN, INT64_MIN,
4789 INT64_MIN, INT64_MAX - 1, -1, 0};
4790
4791 fn = &MacroAssembler::Smin;
4792 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, smin_exp_b);
4793 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, smin_exp_h);
4794 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, smin_exp_s);
4795 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, smin_exp_d);
4796
4797 unsigned sabd_exp_b[] = {1, 128, 127, 128, 255, 1, 128, 1};
4798
4799 unsigned sabd_exp_h[] = {1, 0x8000, 0x8000, 1, 0xffff, 1, 0x7fff, 1};
4800
4801 unsigned sabd_exp_s[] = {1, 0x80000000, 0x7fffffff, 1,
4802 0xffffffff, 0x7fffffff, 0x80000000, 1};
4803
4804 uint64_t sabd_exp_d[] = {0, 0x8000000000000000, 0x7fffffffffffffff, 1,
4805 0x8000000000000000, 1, 0x8000000000000000, 1};
4806
4807 fn = &MacroAssembler::Sabd;
4808 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, sabd_exp_b);
4809 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, sabd_exp_h);
4810 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, sabd_exp_s);
4811 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, sabd_exp_d);
4812 // clang-format on
4813}
4814
4815TEST_SVE(sve_binary_arithmetic_predicated_mul_umulh) {
4816 // clang-format off
4817 unsigned zn_b[] = {0x00, 0x01, 0x20, 0x08, 0x80, 0xff, 0x55, 0xaa};
4818
4819 unsigned zm_b[] = {0x7f, 0xcd, 0x80, 0xff, 0x55, 0xaa, 0x00, 0x08};
4820
4821 unsigned zn_h[] = {0x0000, 0x0001, 0x0020, 0x0800,
4822 0x8000, 0xff00, 0x5555, 0xaaaa};
4823
4824 unsigned zm_h[] = {0x007f, 0x00cd, 0x0800, 0xffff,
4825 0x5555, 0xaaaa, 0x0001, 0x1234};
4826
4827 unsigned zn_s[] = {0x00000000, 0x00000001, 0x00200020, 0x08000800,
4828 0x12345678, 0xffffffff, 0x55555555, 0xaaaaaaaa};
4829
4830 unsigned zm_s[] = {0x00000000, 0x00000001, 0x00200020, 0x08000800,
4831 0x12345678, 0x22223333, 0x55556666, 0x77778888};
4832
4833 uint64_t zn_d[] = {0x0000000000000000, 0x5555555555555555,
4834 0xffffffffffffffff, 0xaaaaaaaaaaaaaaaa};
4835
4836 uint64_t zm_d[] = {0x0000000000000000, 0x1111111133333333,
4837 0xddddddddeeeeeeee, 0xaaaaaaaaaaaaaaaa};
4838
4839 int pg_b[] = {0, 1, 1, 1, 0, 1, 1, 1};
4840 int pg_h[] = {1, 0, 1, 1, 1, 0, 1, 1};
4841 int pg_s[] = {1, 1, 0, 1, 1, 1, 0, 1};
4842 int pg_d[] = {1, 1, 0, 1};
4843
4844 unsigned mul_exp_b[] = {0x00, 0xcd, 0x00, 0xf8, 0x80, 0x56, 0x00, 0x50};
4845
4846 unsigned mul_exp_h[] = {0x0000, 0x0001, 0x0000, 0xf800,
4847 0x8000, 0xff00, 0x5555, 0x9e88};
4848
4849 unsigned mul_exp_s[] = {0x00000000, 0x00000001, 0x00200020, 0x00400000,
4850 0x1df4d840, 0xddddcccd, 0x55555555, 0xb05afa50};
4851
4852 uint64_t mul_exp_d[] = {0x0000000000000000, 0xa4fa4fa4eeeeeeef,
4853 0xffffffffffffffff, 0x38e38e38e38e38e4};
4854
TatWai Chong7a0d3672019-10-23 17:35:18 -07004855 ArithPredicatedFn fn = &MacroAssembler::Mul;
TatWai Chong13634762019-07-16 16:20:45 -07004856 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, mul_exp_b);
4857 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, mul_exp_h);
4858 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, mul_exp_s);
4859 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, mul_exp_d);
4860
4861 unsigned umulh_exp_b[] = {0x00, 0x00, 0x10, 0x07, 0x80, 0xa9, 0x00, 0x05};
4862
4863 unsigned umulh_exp_h[] = {0x0000, 0x0001, 0x0001, 0x07ff,
4864 0x2aaa, 0xff00, 0x0000, 0x0c22};
4865
4866 unsigned umulh_exp_s[] = {0x00000000, 0x00000000, 0x00200020, 0x00400080,
4867 0x014b66dc, 0x22223332, 0x55555555, 0x4fa505af};
4868
4869 uint64_t umulh_exp_d[] = {0x0000000000000000, 0x05b05b05bbbbbbbb,
4870 0xffffffffffffffff, 0x71c71c71c71c71c6};
4871
4872 fn = &MacroAssembler::Umulh;
4873 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, umulh_exp_b);
4874 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, umulh_exp_h);
4875 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, umulh_exp_s);
4876 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, umulh_exp_d);
4877 // clang-format on
4878}
4879
4880TEST_SVE(sve_binary_arithmetic_predicated_smulh) {
4881 // clang-format off
4882 int zn_b[] = {0, 1, -1, INT8_MIN, INT8_MAX, -1, 100, -3};
4883
4884 int zm_b[] = {0, INT8_MIN, INT8_MIN, INT8_MAX, INT8_MAX, -1, 2, 66};
4885
4886 int zn_h[] = {0, 1, -1, INT16_MIN, INT16_MAX, -1, 10000, -3};
4887
4888 int zm_h[] = {0, INT16_MIN, INT16_MIN, INT16_MAX, INT16_MAX, -1, 2, 6666};
4889
4890 int zn_s[] = {0, 1, -1, INT32_MIN, INT32_MAX, -1, 100000000, -3};
4891
4892 int zm_s[] = {0, INT32_MIN, INT32_MIN, INT32_MAX, INT32_MAX, -1, 2, 66666666};
4893
4894 int64_t zn_d[] = {0, -1, INT64_MIN, INT64_MAX};
4895
4896 int64_t zm_d[] = {INT64_MIN, INT64_MAX, INT64_MIN, INT64_MAX};
4897
4898 int pg_b[] = {0, 1, 1, 1, 0, 1, 1, 1};
4899 int pg_h[] = {1, 0, 1, 1, 1, 0, 1, 1};
4900 int pg_s[] = {1, 1, 0, 1, 1, 1, 0, 1};
4901 int pg_d[] = {1, 1, 0, 1};
4902
4903 int exp_b[] = {0, -1, 0, -64, INT8_MAX, 0, 0, -1};
4904
4905 int exp_h[] = {0, 1, 0, -16384, 16383, -1, 0, -1};
4906
4907 int exp_s[] = {0, -1, -1, -1073741824, 1073741823, 0, 100000000, -1};
4908
4909 int64_t exp_d[] = {0, -1, INT64_MIN, 4611686018427387903};
4910
TatWai Chong7a0d3672019-10-23 17:35:18 -07004911 ArithPredicatedFn fn = &MacroAssembler::Smulh;
TatWai Chong13634762019-07-16 16:20:45 -07004912 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, exp_b);
4913 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, exp_h);
4914 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, exp_s);
4915 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, exp_d);
4916 // clang-format on
4917}
4918
4919TEST_SVE(sve_binary_arithmetic_predicated_logical) {
4920 // clang-format off
4921 unsigned zn_b[] = {0x00, 0x01, 0x20, 0x08, 0x80, 0xff, 0x55, 0xaa};
4922 unsigned zm_b[] = {0x7f, 0xcd, 0x80, 0xff, 0x55, 0xaa, 0x00, 0x08};
4923
4924 unsigned zn_h[] = {0x0000, 0x0001, 0x2020, 0x0008,
4925 0x8000, 0xffff, 0x5555, 0xaaaa};
4926 unsigned zm_h[] = {0x7fff, 0xabcd, 0x8000, 0xffff,
4927 0x5555, 0xaaaa, 0x0000, 0x0800};
4928
4929 unsigned zn_s[] = {0x00000001, 0x20200008, 0x8000ffff, 0x5555aaaa};
4930 unsigned zm_s[] = {0x7fffabcd, 0x8000ffff, 0x5555aaaa, 0x00000800};
4931
4932 uint64_t zn_d[] = {0xfedcba9876543210, 0x0123456789abcdef,
4933 0x0001200880ff55aa, 0x0022446688aaccee};
4934 uint64_t zm_d[] = {0xffffeeeeddddcccc, 0xccccddddeeeeffff,
4935 0x7fcd80ff55aa0008, 0x1133557799bbddff};
4936
4937 int pg_b[] = {0, 1, 1, 1, 0, 1, 1, 1};
4938 int pg_h[] = {1, 0, 1, 1, 1, 0, 1, 1};
4939 int pg_s[] = {1, 1, 1, 0};
4940 int pg_d[] = {1, 1, 0, 1};
4941
4942 unsigned and_exp_b[] = {0x00, 0x01, 0x00, 0x08, 0x80, 0xaa, 0x00, 0x08};
4943
4944 unsigned and_exp_h[] = {0x0000, 0x0001, 0x0000, 0x0008,
4945 0x0000, 0xffff, 0x0000, 0x0800};
4946
4947 unsigned and_exp_s[] = {0x00000001, 0x00000008, 0x0000aaaa, 0x5555aaaa};
4948
4949 uint64_t and_exp_d[] = {0xfedcaa8854540000, 0x0000454588aacdef,
4950 0x0001200880ff55aa, 0x0022446688aaccee};
4951
TatWai Chong7a0d3672019-10-23 17:35:18 -07004952 ArithPredicatedFn fn = &MacroAssembler::And;
TatWai Chong13634762019-07-16 16:20:45 -07004953 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, and_exp_b);
4954 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, and_exp_h);
4955 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, and_exp_s);
4956 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, and_exp_d);
4957
4958 unsigned bic_exp_b[] = {0x00, 0x00, 0x20, 0x00, 0x80, 0x55, 0x55, 0xa2};
4959
4960 unsigned bic_exp_h[] = {0x0000, 0x0001, 0x2020, 0x0000,
4961 0x8000, 0xffff, 0x5555, 0xa2aa};
4962
4963 unsigned bic_exp_s[] = {0x00000000, 0x20200000, 0x80005555, 0x5555aaaa};
4964
4965 uint64_t bic_exp_d[] = {0x0000101022003210, 0x0123002201010000,
4966 0x0001200880ff55aa, 0x0000000000000000};
4967
4968 fn = &MacroAssembler::Bic;
4969 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, bic_exp_b);
4970 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, bic_exp_h);
4971 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, bic_exp_s);
4972 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, bic_exp_d);
4973
4974 unsigned eor_exp_b[] = {0x00, 0xcc, 0xa0, 0xf7, 0x80, 0x55, 0x55, 0xa2};
4975
4976 unsigned eor_exp_h[] = {0x7fff, 0x0001, 0xa020, 0xfff7,
4977 0xd555, 0xffff, 0x5555, 0xa2aa};
4978
4979 unsigned eor_exp_s[] = {0x7fffabcc, 0xa020fff7, 0xd5555555, 0x5555aaaa};
4980
4981 uint64_t eor_exp_d[] = {0x01235476ab89fedc, 0xcdef98ba67453210,
4982 0x0001200880ff55aa, 0x1111111111111111};
4983
4984 fn = &MacroAssembler::Eor;
4985 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, eor_exp_b);
4986 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, eor_exp_h);
4987 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, eor_exp_s);
4988 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, eor_exp_d);
4989
4990 unsigned orr_exp_b[] = {0x00, 0xcd, 0xa0, 0xff, 0x80, 0xff, 0x55, 0xaa};
4991
4992 unsigned orr_exp_h[] = {0x7fff, 0x0001, 0xa020, 0xffff,
4993 0xd555, 0xffff, 0x5555, 0xaaaa};
4994
4995 unsigned orr_exp_s[] = {0x7fffabcd, 0xa020ffff, 0xd555ffff, 0x5555aaaa};
4996
4997 uint64_t orr_exp_d[] = {0xfffffefeffddfedc, 0xcdefddffefefffff,
4998 0x0001200880ff55aa, 0x1133557799bbddff};
4999
5000 fn = &MacroAssembler::Orr;
5001 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, orr_exp_b);
5002 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, orr_exp_h);
5003 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, orr_exp_s);
5004 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, orr_exp_d);
5005 // clang-format on
5006}
5007
5008TEST_SVE(sve_binary_arithmetic_predicated_sdiv) {
5009 // clang-format off
5010 int zn_s[] = {0, 1, -1, 2468,
5011 INT32_MIN, INT32_MAX, INT32_MIN, INT32_MAX,
5012 -11111111, 87654321, 0, 0};
5013
5014 int zm_s[] = {1, -1, 1, 1234,
5015 -1, INT32_MIN, 1, -1,
5016 22222222, 80000000, -1, 0};
5017
5018 int64_t zn_d[] = {0, 1, -1, 2468,
5019 INT64_MIN, INT64_MAX, INT64_MIN, INT64_MAX,
5020 -11111111, 87654321, 0, 0};
5021
5022 int64_t zm_d[] = {1, -1, 1, 1234,
5023 -1, INT64_MIN, 1, -1,
5024 22222222, 80000000, -1, 0};
5025
5026 int pg_s[] = {1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0};
5027 int pg_d[] = {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1};
5028
5029 int exp_s[] = {0, 1, -1, 2,
5030 INT32_MIN, 0, INT32_MIN, -INT32_MAX,
5031 0, 1, 0, 0};
5032
5033 int64_t exp_d[] = {0, -1, -1, 2,
5034 INT64_MIN, INT64_MAX, INT64_MIN, -INT64_MAX,
5035 0, 1, 0, 0};
5036
TatWai Chong7a0d3672019-10-23 17:35:18 -07005037 ArithPredicatedFn fn = &MacroAssembler::Sdiv;
TatWai Chong13634762019-07-16 16:20:45 -07005038 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, exp_s);
5039 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, exp_d);
5040 // clang-format on
5041}
5042
5043TEST_SVE(sve_binary_arithmetic_predicated_udiv) {
5044 // clang-format off
5045 unsigned zn_s[] = {0x00000000, 0x00000001, 0xffffffff, 0x80000000,
5046 0xffffffff, 0x80000000, 0xffffffff, 0x0000f000};
5047
5048 unsigned zm_s[] = {0x00000001, 0xffffffff, 0x80000000, 0x00000002,
5049 0x00000000, 0x00000001, 0x00008000, 0xf0000000};
5050
5051 uint64_t zn_d[] = {0x0000000000000000, 0x0000000000000001,
5052 0xffffffffffffffff, 0x8000000000000000,
5053 0xffffffffffffffff, 0x8000000000000000,
5054 0xffffffffffffffff, 0xf0000000f0000000};
5055
5056 uint64_t zm_d[] = {0x0000000000000001, 0xffffffff00000000,
5057 0x8000000000000000, 0x0000000000000002,
5058 0x8888888888888888, 0x0000000000000001,
5059 0x0000000080000000, 0x00000000f0000000};
5060
5061 int pg_s[] = {1, 1, 0, 1, 1, 0, 1, 1};
5062 int pg_d[] = {1, 0, 1, 1, 1, 1, 0, 1};
5063
5064 unsigned exp_s[] = {0x00000000, 0x00000000, 0xffffffff, 0x40000000,
5065 0x00000000, 0x80000000, 0x0001ffff, 0x00000000};
5066
5067 uint64_t exp_d[] = {0x0000000000000000, 0x0000000000000001,
5068 0x0000000000000001, 0x4000000000000000,
5069 0x0000000000000001, 0x8000000000000000,
5070 0xffffffffffffffff, 0x0000000100000001};
5071
TatWai Chong7a0d3672019-10-23 17:35:18 -07005072 ArithPredicatedFn fn = &MacroAssembler::Udiv;
TatWai Chong13634762019-07-16 16:20:45 -07005073 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, exp_s);
5074 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, exp_d);
5075 // clang-format on
5076}
5077
TatWai Chong7a0d3672019-10-23 17:35:18 -07005078typedef void (MacroAssembler::*ArithFn)(const ZRegister& zd,
5079 const ZRegister& zn,
5080 const ZRegister& zm);
TatWai Chong845246b2019-08-08 00:01:58 -07005081
5082template <typename T>
5083static void IntArithHelper(Test* config,
TatWai Chong7a0d3672019-10-23 17:35:18 -07005084 ArithFn macro,
TatWai Chong845246b2019-08-08 00:01:58 -07005085 unsigned lane_size_in_bits,
5086 const T& zn_inputs,
5087 const T& zm_inputs,
5088 const T& zd_expected) {
5089 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5090 START();
5091
5092 ZRegister zn = z31.WithLaneSize(lane_size_in_bits);
5093 ZRegister zm = z27.WithLaneSize(lane_size_in_bits);
5094 InsrHelper(&masm, zn, zn_inputs);
5095 InsrHelper(&masm, zm, zm_inputs);
5096
5097 ZRegister zd = z0.WithLaneSize(lane_size_in_bits);
5098 (masm.*macro)(zd, zn, zm);
5099
5100 END();
5101
5102 if (CAN_RUN()) {
5103 RUN();
5104 ASSERT_EQUAL_SVE(zd_expected, zd);
5105 }
5106}
5107
5108TEST_SVE(sve_arithmetic_unpredicated_add_sqadd_uqadd) {
5109 // clang-format off
TatWai Chong6995bfd2019-09-26 10:48:05 +01005110 unsigned in_b[] = {0x81, 0x7f, 0x10, 0xaa, 0x55, 0xff, 0xf0};
5111 unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa, 0x5555, 0xffff, 0xf0f0};
5112 unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0x10001010, 0xaaaaaaaa, 0xf000f0f0};
5113 uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f,
TatWai Chong845246b2019-08-08 00:01:58 -07005114 0x1000000010001010, 0xf0000000f000f0f0};
5115
TatWai Chong7a0d3672019-10-23 17:35:18 -07005116 ArithFn fn = &MacroAssembler::Add;
TatWai Chong845246b2019-08-08 00:01:58 -07005117
5118 unsigned add_exp_b[] = {0x02, 0xfe, 0x20, 0x54, 0xaa, 0xfe, 0xe0};
5119 unsigned add_exp_h[] = {0x0302, 0xfefe, 0x2020, 0x5554, 0xaaaa, 0xfffe, 0xe1e0};
5120 unsigned add_exp_s[] = {0x00030302, 0xfffefefe, 0x20002020, 0x55555554, 0xe001e1e0};
5121 uint64_t add_exp_d[] = {0x0000000300030302, 0xfffffffefffefefe,
5122 0x2000000020002020, 0xe0000001e001e1e0};
5123
TatWai Chong6995bfd2019-09-26 10:48:05 +01005124 IntArithHelper(config, fn, kBRegSize, in_b, in_b, add_exp_b);
5125 IntArithHelper(config, fn, kHRegSize, in_h, in_h, add_exp_h);
5126 IntArithHelper(config, fn, kSRegSize, in_s, in_s, add_exp_s);
5127 IntArithHelper(config, fn, kDRegSize, in_d, in_d, add_exp_d);
TatWai Chong845246b2019-08-08 00:01:58 -07005128
5129 fn = &MacroAssembler::Sqadd;
5130
5131 unsigned sqadd_exp_b[] = {0x80, 0x7f, 0x20, 0x80, 0x7f, 0xfe, 0xe0};
5132 unsigned sqadd_exp_h[] = {0x8000, 0x7fff, 0x2020, 0x8000, 0x7fff, 0xfffe, 0xe1e0};
5133 unsigned sqadd_exp_s[] = {0x80000000, 0x7fffffff, 0x20002020, 0x80000000, 0xe001e1e0};
5134 uint64_t sqadd_exp_d[] = {0x8000000000000000, 0x7fffffffffffffff,
5135 0x2000000020002020, 0xe0000001e001e1e0};
5136
TatWai Chong6995bfd2019-09-26 10:48:05 +01005137 IntArithHelper(config, fn, kBRegSize, in_b, in_b, sqadd_exp_b);
5138 IntArithHelper(config, fn, kHRegSize, in_h, in_h, sqadd_exp_h);
5139 IntArithHelper(config, fn, kSRegSize, in_s, in_s, sqadd_exp_s);
5140 IntArithHelper(config, fn, kDRegSize, in_d, in_d, sqadd_exp_d);
TatWai Chong845246b2019-08-08 00:01:58 -07005141
5142 fn = &MacroAssembler::Uqadd;
5143
5144 unsigned uqadd_exp_b[] = {0xff, 0xfe, 0x20, 0xff, 0xaa, 0xff, 0xff};
5145 unsigned uqadd_exp_h[] = {0xffff, 0xfefe, 0x2020, 0xffff, 0xaaaa, 0xffff, 0xffff};
5146 unsigned uqadd_exp_s[] = {0xffffffff, 0xfffefefe, 0x20002020, 0xffffffff, 0xffffffff};
5147 uint64_t uqadd_exp_d[] = {0xffffffffffffffff, 0xfffffffefffefefe,
5148 0x2000000020002020, 0xffffffffffffffff};
5149
TatWai Chong6995bfd2019-09-26 10:48:05 +01005150 IntArithHelper(config, fn, kBRegSize, in_b, in_b, uqadd_exp_b);
5151 IntArithHelper(config, fn, kHRegSize, in_h, in_h, uqadd_exp_h);
5152 IntArithHelper(config, fn, kSRegSize, in_s, in_s, uqadd_exp_s);
5153 IntArithHelper(config, fn, kDRegSize, in_d, in_d, uqadd_exp_d);
TatWai Chong845246b2019-08-08 00:01:58 -07005154 // clang-format on
5155}
5156
5157TEST_SVE(sve_arithmetic_unpredicated_sub_sqsub_uqsub) {
5158 // clang-format off
5159
5160 unsigned ins1_b[] = {0x81, 0x7f, 0x7e, 0xaa};
5161 unsigned ins2_b[] = {0x10, 0xf0, 0xf0, 0x55};
5162
5163 unsigned ins1_h[] = {0x8181, 0x7f7f, 0x7e7e, 0xaaaa};
5164 unsigned ins2_h[] = {0x1010, 0xf0f0, 0xf0f0, 0x5555};
5165
5166 unsigned ins1_s[] = {0x80018181, 0x7fff7f7f, 0x7eee7e7e, 0xaaaaaaaa};
5167 unsigned ins2_s[] = {0x10001010, 0xf000f0f0, 0xf000f0f0, 0x55555555};
5168
5169 uint64_t ins1_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f,
5170 0x7eeeeeee7eee7e7e, 0xaaaaaaaaaaaaaaaa};
5171 uint64_t ins2_d[] = {0x1000000010001010, 0xf0000000f000f0f0,
5172 0xf0000000f000f0f0, 0x5555555555555555};
5173
TatWai Chong7a0d3672019-10-23 17:35:18 -07005174 ArithFn fn = &MacroAssembler::Sub;
TatWai Chong845246b2019-08-08 00:01:58 -07005175
5176 unsigned ins1_sub_ins2_exp_b[] = {0x71, 0x8f, 0x8e, 0x55};
5177 unsigned ins1_sub_ins2_exp_h[] = {0x7171, 0x8e8f, 0x8d8e, 0x5555};
5178 unsigned ins1_sub_ins2_exp_s[] = {0x70017171, 0x8ffe8e8f, 0x8eed8d8e, 0x55555555};
5179 uint64_t ins1_sub_ins2_exp_d[] = {0x7000000170017171, 0x8ffffffe8ffe8e8f,
5180 0x8eeeeeed8eed8d8e, 0x5555555555555555};
5181
5182 IntArithHelper(config, fn, kBRegSize, ins1_b, ins2_b, ins1_sub_ins2_exp_b);
5183 IntArithHelper(config, fn, kHRegSize, ins1_h, ins2_h, ins1_sub_ins2_exp_h);
5184 IntArithHelper(config, fn, kSRegSize, ins1_s, ins2_s, ins1_sub_ins2_exp_s);
5185 IntArithHelper(config, fn, kDRegSize, ins1_d, ins2_d, ins1_sub_ins2_exp_d);
5186
5187 unsigned ins2_sub_ins1_exp_b[] = {0x8f, 0x71, 0x72, 0xab};
5188 unsigned ins2_sub_ins1_exp_h[] = {0x8e8f, 0x7171, 0x7272, 0xaaab};
5189 unsigned ins2_sub_ins1_exp_s[] = {0x8ffe8e8f, 0x70017171, 0x71127272, 0xaaaaaaab};
5190 uint64_t ins2_sub_ins1_exp_d[] = {0x8ffffffe8ffe8e8f, 0x7000000170017171,
5191 0x7111111271127272, 0xaaaaaaaaaaaaaaab};
5192
5193 IntArithHelper(config, fn, kBRegSize, ins2_b, ins1_b, ins2_sub_ins1_exp_b);
5194 IntArithHelper(config, fn, kHRegSize, ins2_h, ins1_h, ins2_sub_ins1_exp_h);
5195 IntArithHelper(config, fn, kSRegSize, ins2_s, ins1_s, ins2_sub_ins1_exp_s);
5196 IntArithHelper(config, fn, kDRegSize, ins2_d, ins1_d, ins2_sub_ins1_exp_d);
5197
5198 fn = &MacroAssembler::Sqsub;
5199
5200 unsigned ins1_sqsub_ins2_exp_b[] = {0x80, 0x7f, 0x7f, 0x80};
5201 unsigned ins1_sqsub_ins2_exp_h[] = {0x8000, 0x7fff, 0x7fff, 0x8000};
5202 unsigned ins1_sqsub_ins2_exp_s[] = {0x80000000, 0x7fffffff, 0x7fffffff, 0x80000000};
5203 uint64_t ins1_sqsub_ins2_exp_d[] = {0x8000000000000000, 0x7fffffffffffffff,
5204 0x7fffffffffffffff, 0x8000000000000000};
5205
5206 IntArithHelper(config, fn, kBRegSize, ins1_b, ins2_b, ins1_sqsub_ins2_exp_b);
5207 IntArithHelper(config, fn, kHRegSize, ins1_h, ins2_h, ins1_sqsub_ins2_exp_h);
5208 IntArithHelper(config, fn, kSRegSize, ins1_s, ins2_s, ins1_sqsub_ins2_exp_s);
5209 IntArithHelper(config, fn, kDRegSize, ins1_d, ins2_d, ins1_sqsub_ins2_exp_d);
5210
5211 unsigned ins2_sqsub_ins1_exp_b[] = {0x7f, 0x80, 0x80, 0x7f};
5212 unsigned ins2_sqsub_ins1_exp_h[] = {0x7fff, 0x8000, 0x8000, 0x7fff};
5213 unsigned ins2_sqsub_ins1_exp_s[] = {0x7fffffff, 0x80000000, 0x80000000, 0x7fffffff};
5214 uint64_t ins2_sqsub_ins1_exp_d[] = {0x7fffffffffffffff, 0x8000000000000000,
5215 0x8000000000000000, 0x7fffffffffffffff};
5216
5217 IntArithHelper(config, fn, kBRegSize, ins2_b, ins1_b, ins2_sqsub_ins1_exp_b);
5218 IntArithHelper(config, fn, kHRegSize, ins2_h, ins1_h, ins2_sqsub_ins1_exp_h);
5219 IntArithHelper(config, fn, kSRegSize, ins2_s, ins1_s, ins2_sqsub_ins1_exp_s);
5220 IntArithHelper(config, fn, kDRegSize, ins2_d, ins1_d, ins2_sqsub_ins1_exp_d);
5221
5222 fn = &MacroAssembler::Uqsub;
5223
5224 unsigned ins1_uqsub_ins2_exp_b[] = {0x71, 0x00, 0x00, 0x55};
5225 unsigned ins1_uqsub_ins2_exp_h[] = {0x7171, 0x0000, 0x0000, 0x5555};
5226 unsigned ins1_uqsub_ins2_exp_s[] = {0x70017171, 0x00000000, 0x00000000, 0x55555555};
5227 uint64_t ins1_uqsub_ins2_exp_d[] = {0x7000000170017171, 0x0000000000000000,
5228 0x0000000000000000, 0x5555555555555555};
5229
5230 IntArithHelper(config, fn, kBRegSize, ins1_b, ins2_b, ins1_uqsub_ins2_exp_b);
5231 IntArithHelper(config, fn, kHRegSize, ins1_h, ins2_h, ins1_uqsub_ins2_exp_h);
5232 IntArithHelper(config, fn, kSRegSize, ins1_s, ins2_s, ins1_uqsub_ins2_exp_s);
5233 IntArithHelper(config, fn, kDRegSize, ins1_d, ins2_d, ins1_uqsub_ins2_exp_d);
5234
5235 unsigned ins2_uqsub_ins1_exp_b[] = {0x00, 0x71, 0x72, 0x00};
5236 unsigned ins2_uqsub_ins1_exp_h[] = {0x0000, 0x7171, 0x7272, 0x0000};
5237 unsigned ins2_uqsub_ins1_exp_s[] = {0x00000000, 0x70017171, 0x71127272, 0x00000000};
5238 uint64_t ins2_uqsub_ins1_exp_d[] = {0x0000000000000000, 0x7000000170017171,
5239 0x7111111271127272, 0x0000000000000000};
5240
5241 IntArithHelper(config, fn, kBRegSize, ins2_b, ins1_b, ins2_uqsub_ins1_exp_b);
5242 IntArithHelper(config, fn, kHRegSize, ins2_h, ins1_h, ins2_uqsub_ins1_exp_h);
5243 IntArithHelper(config, fn, kSRegSize, ins2_s, ins1_s, ins2_uqsub_ins1_exp_s);
5244 IntArithHelper(config, fn, kDRegSize, ins2_d, ins1_d, ins2_uqsub_ins1_exp_d);
5245 // clang-format on
5246}
5247
Jacob Bramley9e5da2a2019-08-06 18:52:07 +01005248TEST_SVE(sve_rdvl) {
5249 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5250 START();
5251
5252 // Encodable multipliers.
5253 __ Rdvl(x0, 0);
5254 __ Rdvl(x1, 1);
5255 __ Rdvl(x2, 2);
5256 __ Rdvl(x3, 31);
5257 __ Rdvl(x4, -1);
5258 __ Rdvl(x5, -2);
5259 __ Rdvl(x6, -32);
5260
5261 // For unencodable multipliers, the MacroAssembler uses a sequence of
5262 // instructions.
5263 __ Rdvl(x10, 32);
5264 __ Rdvl(x11, -33);
5265 __ Rdvl(x12, 42);
5266 __ Rdvl(x13, -42);
5267
5268 // The maximum value of VL is 256 (bytes), so the multiplier is limited to the
5269 // range [INT64_MIN/256, INT64_MAX/256], to ensure that no signed overflow
5270 // occurs in the macro.
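  // For example, 0x007fffffffffffff * 256 == 0x7fffffffffffff00, which still
  // fits in a signed 64-bit result.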
5271 __ Rdvl(x14, 0x007fffffffffffff);
5272 __ Rdvl(x15, -0x0080000000000000);
5273
5274 END();
5275
5276 if (CAN_RUN()) {
5277 RUN();
5278
5279 uint64_t vl = config->sve_vl_in_bytes();
5280
5281 ASSERT_EQUAL_64(vl * 0, x0);
5282 ASSERT_EQUAL_64(vl * 1, x1);
5283 ASSERT_EQUAL_64(vl * 2, x2);
5284 ASSERT_EQUAL_64(vl * 31, x3);
5285 ASSERT_EQUAL_64(vl * -1, x4);
5286 ASSERT_EQUAL_64(vl * -2, x5);
5287 ASSERT_EQUAL_64(vl * -32, x6);
5288
5289 ASSERT_EQUAL_64(vl * 32, x10);
5290 ASSERT_EQUAL_64(vl * -33, x11);
5291 ASSERT_EQUAL_64(vl * 42, x12);
5292 ASSERT_EQUAL_64(vl * -42, x13);
5293
5294 ASSERT_EQUAL_64(vl * 0x007fffffffffffff, x14);
5295 ASSERT_EQUAL_64(vl * 0xff80000000000000, x15);
5296 }
5297}
5298
5299TEST_SVE(sve_rdpl) {
5300 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5301 START();
5302
5303 // There is no `rdpl` instruction, so the MacroAssembler maps `Rdpl` onto
5304 // Addpl(xd, xzr, ...).
5305
5306 // Encodable multipliers (as `addvl`).
5307 __ Rdpl(x0, 0);
5308 __ Rdpl(x1, 8);
5309 __ Rdpl(x2, 248);
5310 __ Rdpl(x3, -8);
5311 __ Rdpl(x4, -256);
5312
5313 // Encodable multipliers (as `movz` + `addpl`).
5314 __ Rdpl(x7, 31);
Jacob Bramley889984c2019-10-28 17:28:48 +00005315 __ Rdpl(x8, -31);
Jacob Bramley9e5da2a2019-08-06 18:52:07 +01005316
5317 // For unencodable multipliers, the MacroAssembler uses a sequence of
5318 // instructions.
5319 __ Rdpl(x10, 42);
5320 __ Rdpl(x11, -42);
5321
5322 // The maximum value of VL is 256 (bytes), so the multiplier is limited to the
5323 // range [INT64_MIN/256, INT64_MAX/256], to ensure that no signed overflow
5324 // occurs in the macro.
5325 __ Rdpl(x12, 0x007fffffffffffff);
5326 __ Rdpl(x13, -0x0080000000000000);
5327
5328 END();
5329
5330 if (CAN_RUN()) {
5331 RUN();
5332
5333 uint64_t vl = config->sve_vl_in_bytes();
5334 VIXL_ASSERT((vl % kZRegBitsPerPRegBit) == 0);
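    // Each predicate bit corresponds to one byte of a Z register, so
    // PL = VL / kZRegBitsPerPRegBit = VL / 8.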
5335 uint64_t pl = vl / kZRegBitsPerPRegBit;
5336
5337 ASSERT_EQUAL_64(pl * 0, x0);
5338 ASSERT_EQUAL_64(pl * 8, x1);
5339 ASSERT_EQUAL_64(pl * 248, x2);
5340 ASSERT_EQUAL_64(pl * -8, x3);
5341 ASSERT_EQUAL_64(pl * -256, x4);
5342
5343 ASSERT_EQUAL_64(pl * 31, x7);
Jacob Bramley889984c2019-10-28 17:28:48 +00005344 ASSERT_EQUAL_64(pl * -31, x8);
Jacob Bramley9e5da2a2019-08-06 18:52:07 +01005345
5346 ASSERT_EQUAL_64(pl * 42, x10);
5347 ASSERT_EQUAL_64(pl * -42, x11);
5348
5349 ASSERT_EQUAL_64(pl * 0x007fffffffffffff, x12);
5350 ASSERT_EQUAL_64(pl * 0xff80000000000000, x13);
5351 }
5352}
5353
5354TEST_SVE(sve_addvl) {
5355 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5356 START();
5357
5358 uint64_t base = 0x1234567800000000;
5359 __ Mov(x30, base);
5360
5361 // Encodable multipliers.
5362 __ Addvl(x0, x30, 0);
5363 __ Addvl(x1, x30, 1);
5364 __ Addvl(x2, x30, 31);
5365 __ Addvl(x3, x30, -1);
5366 __ Addvl(x4, x30, -32);
5367
5368 // For unencodable multipliers, the MacroAssembler uses `Rdvl` and `Add`.
5369 __ Addvl(x5, x30, 32);
5370 __ Addvl(x6, x30, -33);
5371
5372 // Test the limits of the multiplier supported by the `Rdvl` macro.
5373 __ Addvl(x7, x30, 0x007fffffffffffff);
5374 __ Addvl(x8, x30, -0x0080000000000000);
5375
5376 // Check that xzr behaves correctly.
5377 __ Addvl(x9, xzr, 8);
5378 __ Addvl(x10, xzr, 42);
5379
5380 // Check that sp behaves correctly with encodable and unencodable multipliers.
5381 __ Addvl(sp, sp, -5);
5382 __ Addvl(sp, sp, -37);
5383 __ Addvl(x11, sp, -2);
5384 __ Addvl(sp, x11, 2);
5385 __ Addvl(x12, sp, -42);
5386
5387 // Restore the value of sp.
5388 __ Addvl(sp, x11, 39);
5389 __ Addvl(sp, sp, 5);
5390
5391 // Adjust x11 and x12 to make the test sp-agnostic.
5392 __ Sub(x11, sp, x11);
5393 __ Sub(x12, sp, x12);
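  // Working through the sp adjustments above, x11 should now hold 44 * VL and
  // x12 should hold 84 * VL, matching the checks below.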
5394
5395 // Check cases where xd.Is(xn). This stresses scratch register allocation.
5396 __ Mov(x20, x30);
5397 __ Mov(x21, x30);
5398 __ Mov(x22, x30);
5399 __ Addvl(x20, x20, 4);
5400 __ Addvl(x21, x21, 42);
5401 __ Addvl(x22, x22, -0x0080000000000000);
5402
5403 END();
5404
5405 if (CAN_RUN()) {
5406 RUN();
5407
5408 uint64_t vl = config->sve_vl_in_bytes();
5409
5410 ASSERT_EQUAL_64(base + (vl * 0), x0);
5411 ASSERT_EQUAL_64(base + (vl * 1), x1);
5412 ASSERT_EQUAL_64(base + (vl * 31), x2);
5413 ASSERT_EQUAL_64(base + (vl * -1), x3);
5414 ASSERT_EQUAL_64(base + (vl * -32), x4);
5415
5416 ASSERT_EQUAL_64(base + (vl * 32), x5);
5417 ASSERT_EQUAL_64(base + (vl * -33), x6);
5418
5419 ASSERT_EQUAL_64(base + (vl * 0x007fffffffffffff), x7);
5420 ASSERT_EQUAL_64(base + (vl * 0xff80000000000000), x8);
5421
5422 ASSERT_EQUAL_64(vl * 8, x9);
5423 ASSERT_EQUAL_64(vl * 42, x10);
5424
5425 ASSERT_EQUAL_64(vl * 44, x11);
5426 ASSERT_EQUAL_64(vl * 84, x12);
5427
5428 ASSERT_EQUAL_64(base + (vl * 4), x20);
5429 ASSERT_EQUAL_64(base + (vl * 42), x21);
5430 ASSERT_EQUAL_64(base + (vl * 0xff80000000000000), x22);
5431
5432 ASSERT_EQUAL_64(base, x30);
5433 }
5434}
5435
5436TEST_SVE(sve_addpl) {
5437 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5438 START();
5439
5440 uint64_t base = 0x1234567800000000;
5441 __ Mov(x30, base);
5442
5443 // Encodable multipliers.
5444 __ Addpl(x0, x30, 0);
5445 __ Addpl(x1, x30, 1);
5446 __ Addpl(x2, x30, 31);
5447 __ Addpl(x3, x30, -1);
5448 __ Addpl(x4, x30, -32);
5449
5450 // For unencodable multipliers, the MacroAssembler uses `Addvl` if it can, or
5451 // it falls back to `Rdvl` and `Add`.
5452 __ Addpl(x5, x30, 32);
5453 __ Addpl(x6, x30, -33);
5454
5455 // Test the limits of the multiplier supported by the `Rdvl` macro.
5456 __ Addpl(x7, x30, 0x007fffffffffffff);
5457 __ Addpl(x8, x30, -0x0080000000000000);
5458
5459 // Check that xzr behaves correctly.
5460 __ Addpl(x9, xzr, 8);
5461 __ Addpl(x10, xzr, 42);
5462
5463 // Check that sp behaves correctly with encodable and unencodable multipliers.
5464 __ Addpl(sp, sp, -5);
5465 __ Addpl(sp, sp, -37);
5466 __ Addpl(x11, sp, -2);
5467 __ Addpl(sp, x11, 2);
5468 __ Addpl(x12, sp, -42);
5469
5470 // Restore the value of sp.
5471 __ Addpl(sp, x11, 39);
5472 __ Addpl(sp, sp, 5);
5473
5474 // Adjust x11 and x12 to make the test sp-agnostic.
5475 __ Sub(x11, sp, x11);
5476 __ Sub(x12, sp, x12);
5477
5478 // Check cases where xd.Is(xn). This stresses scratch register allocation.
5479 __ Mov(x20, x30);
5480 __ Mov(x21, x30);
5481 __ Mov(x22, x30);
5482 __ Addpl(x20, x20, 4);
5483 __ Addpl(x21, x21, 42);
5484 __ Addpl(x22, x22, -0x0080000000000000);
5485
5486 END();
5487
5488 if (CAN_RUN()) {
5489 RUN();
5490
5491 uint64_t vl = config->sve_vl_in_bytes();
5492 VIXL_ASSERT((vl % kZRegBitsPerPRegBit) == 0);
5493 uint64_t pl = vl / kZRegBitsPerPRegBit;
5494
5495 ASSERT_EQUAL_64(base + (pl * 0), x0);
5496 ASSERT_EQUAL_64(base + (pl * 1), x1);
5497 ASSERT_EQUAL_64(base + (pl * 31), x2);
5498 ASSERT_EQUAL_64(base + (pl * -1), x3);
5499 ASSERT_EQUAL_64(base + (pl * -32), x4);
5500
5501 ASSERT_EQUAL_64(base + (pl * 32), x5);
5502 ASSERT_EQUAL_64(base + (pl * -33), x6);
5503
5504 ASSERT_EQUAL_64(base + (pl * 0x007fffffffffffff), x7);
5505 ASSERT_EQUAL_64(base + (pl * 0xff80000000000000), x8);
5506
5507 ASSERT_EQUAL_64(pl * 8, x9);
5508 ASSERT_EQUAL_64(pl * 42, x10);
5509
5510 ASSERT_EQUAL_64(pl * 44, x11);
5511 ASSERT_EQUAL_64(pl * 84, x12);
5512
5513 ASSERT_EQUAL_64(base + (pl * 4), x20);
5514 ASSERT_EQUAL_64(base + (pl * 42), x21);
5515 ASSERT_EQUAL_64(base + (pl * 0xff80000000000000), x22);
5516
5517 ASSERT_EQUAL_64(base, x30);
5518 }
5519}
5520
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005521TEST_SVE(sve_calculate_sve_address) {
5522 // Shadow the `MacroAssembler` type so that the test macros work without
5523 // modification.
5524 typedef CalculateSVEAddressMacroAssembler MacroAssembler;
5525
Jacob Bramley1314c462019-08-08 10:54:16 +01005526 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005527 START(); // NOLINT(clang-diagnostic-local-type-template-args)
Jacob Bramley1314c462019-08-08 10:54:16 +01005528
5529 uint64_t base = 0x1234567800000000;
5530 __ Mov(x28, base);
5531 __ Mov(x29, 48);
5532 __ Mov(x30, -48);
5533
5534 // Simple scalar (or equivalent) cases.
5535
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005536 __ CalculateSVEAddress(x0, SVEMemOperand(x28));
5537 __ CalculateSVEAddress(x1, SVEMemOperand(x28, 0));
5538 __ CalculateSVEAddress(x2, SVEMemOperand(x28, 0, SVE_MUL_VL));
5539 __ CalculateSVEAddress(x3, SVEMemOperand(x28, 0, SVE_MUL_VL), 3);
5540 __ CalculateSVEAddress(x4, SVEMemOperand(x28, xzr));
5541 __ CalculateSVEAddress(x5, SVEMemOperand(x28, xzr, LSL, 42));
Jacob Bramley1314c462019-08-08 10:54:16 +01005542
5543 // scalar-plus-immediate
5544
5545 // Unscaled immediates, handled with `Add`.
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005546 __ CalculateSVEAddress(x6, SVEMemOperand(x28, 42));
5547 __ CalculateSVEAddress(x7, SVEMemOperand(x28, -42));
Jacob Bramley1314c462019-08-08 10:54:16 +01005548 // Scaled immediates, handled with `Addvl` or `Addpl`.
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005549 __ CalculateSVEAddress(x8, SVEMemOperand(x28, 31, SVE_MUL_VL), 0);
5550 __ CalculateSVEAddress(x9, SVEMemOperand(x28, -32, SVE_MUL_VL), 0);
Jacob Bramley1314c462019-08-08 10:54:16 +01005551 // Out of `addvl` or `addpl` range.
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005552 __ CalculateSVEAddress(x10, SVEMemOperand(x28, 42, SVE_MUL_VL), 0);
5553 __ CalculateSVEAddress(x11, SVEMemOperand(x28, -42, SVE_MUL_VL), 0);
5554 // As above, for VL-based accesses smaller than a Z register.
5555 VIXL_STATIC_ASSERT(kZRegBitsPerPRegBitLog2 == 3);
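  // The trailing argument scales the SVE_MUL_VL immediate by VL / (1 << arg),
  // so, for example, an immediate of -32 * 8 with an argument of 3 still
  // addresses base - 32 * VL (see the expected values below).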
5556 __ CalculateSVEAddress(x12, SVEMemOperand(x28, -32 * 8, SVE_MUL_VL), 3);
5557 __ CalculateSVEAddress(x13, SVEMemOperand(x28, -42 * 8, SVE_MUL_VL), 3);
5558 __ CalculateSVEAddress(x14, SVEMemOperand(x28, -32 * 4, SVE_MUL_VL), 2);
5559 __ CalculateSVEAddress(x15, SVEMemOperand(x28, -42 * 4, SVE_MUL_VL), 2);
5560 __ CalculateSVEAddress(x18, SVEMemOperand(x28, -32 * 2, SVE_MUL_VL), 1);
5561 __ CalculateSVEAddress(x19, SVEMemOperand(x28, -42 * 2, SVE_MUL_VL), 1);
Jacob Bramley1314c462019-08-08 10:54:16 +01005562
5563 // scalar-plus-scalar
5564
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005565 __ CalculateSVEAddress(x20, SVEMemOperand(x28, x29));
5566 __ CalculateSVEAddress(x21, SVEMemOperand(x28, x30));
5567 __ CalculateSVEAddress(x22, SVEMemOperand(x28, x29, LSL, 8));
5568 __ CalculateSVEAddress(x23, SVEMemOperand(x28, x30, LSL, 8));
Jacob Bramley1314c462019-08-08 10:54:16 +01005569
5570 // In-place updates, to stress scratch register allocation.
5571
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005572 __ Mov(x24, 0xabcd000000000000);
5573 __ Mov(x25, 0xabcd101100000000);
5574 __ Mov(x26, 0xabcd202200000000);
5575 __ Mov(x27, 0xabcd303300000000);
5576 __ Mov(x28, 0xabcd404400000000);
5577 __ Mov(x29, 0xabcd505500000000);
Jacob Bramley1314c462019-08-08 10:54:16 +01005578
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005579 __ CalculateSVEAddress(x24, SVEMemOperand(x24));
5580 __ CalculateSVEAddress(x25, SVEMemOperand(x25, 0x42));
5581 __ CalculateSVEAddress(x26, SVEMemOperand(x26, 3, SVE_MUL_VL), 0);
5582 __ CalculateSVEAddress(x27, SVEMemOperand(x27, 0x42, SVE_MUL_VL), 3);
5583 __ CalculateSVEAddress(x28, SVEMemOperand(x28, x30));
5584 __ CalculateSVEAddress(x29, SVEMemOperand(x29, x30, LSL, 4));
Jacob Bramley1314c462019-08-08 10:54:16 +01005585
5586 END();
5587
5588 if (CAN_RUN()) {
5589 RUN();
5590
5591 uint64_t vl = config->sve_vl_in_bytes();
5592 VIXL_ASSERT((vl % kZRegBitsPerPRegBit) == 0);
5593 uint64_t pl = vl / kZRegBitsPerPRegBit;
5594
5595 // Simple scalar (or equivalent) cases.
5596 ASSERT_EQUAL_64(base, x0);
5597 ASSERT_EQUAL_64(base, x1);
5598 ASSERT_EQUAL_64(base, x2);
5599 ASSERT_EQUAL_64(base, x3);
5600 ASSERT_EQUAL_64(base, x4);
5601 ASSERT_EQUAL_64(base, x5);
5602
5603 // scalar-plus-immediate
5604 ASSERT_EQUAL_64(base + 42, x6);
5605 ASSERT_EQUAL_64(base - 42, x7);
5606 ASSERT_EQUAL_64(base + (31 * vl), x8);
5607 ASSERT_EQUAL_64(base - (32 * vl), x9);
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005608 ASSERT_EQUAL_64(base + (42 * vl), x10);
5609 ASSERT_EQUAL_64(base - (42 * vl), x11);
5610 ASSERT_EQUAL_64(base - (32 * vl), x12);
Jacob Bramley1314c462019-08-08 10:54:16 +01005611 ASSERT_EQUAL_64(base - (42 * vl), x13);
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005612 ASSERT_EQUAL_64(base - (32 * vl), x14);
5613 ASSERT_EQUAL_64(base - (42 * vl), x15);
5614 ASSERT_EQUAL_64(base - (32 * vl), x18);
5615 ASSERT_EQUAL_64(base - (42 * vl), x19);
Jacob Bramley1314c462019-08-08 10:54:16 +01005616
5617 // scalar-plus-scalar
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005618 ASSERT_EQUAL_64(base + 48, x20);
5619 ASSERT_EQUAL_64(base - 48, x21);
5620 ASSERT_EQUAL_64(base + (48 << 8), x22);
5621 ASSERT_EQUAL_64(base - (48 << 8), x23);
Jacob Bramley1314c462019-08-08 10:54:16 +01005622
5623 // In-place updates.
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005624 ASSERT_EQUAL_64(0xabcd000000000000, x24);
5625 ASSERT_EQUAL_64(0xabcd101100000000 + 0x42, x25);
5626 ASSERT_EQUAL_64(0xabcd202200000000 + (3 * vl), x26);
5627 ASSERT_EQUAL_64(0xabcd303300000000 + (0x42 * pl), x27);
5628 ASSERT_EQUAL_64(0xabcd404400000000 - 48, x28);
5629 ASSERT_EQUAL_64(0xabcd505500000000 - (48 << 4), x29);
Jacob Bramley1314c462019-08-08 10:54:16 +01005630 }
5631}
5632
TatWai Chong4f28df72019-08-14 17:50:30 -07005633TEST_SVE(sve_permute_vector_unpredicated) {
5634 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
5635 START();
5636
Jacob Bramleye4983d42019-10-08 10:56:15 +01005637 // Initialise registers with known values first.
5638 __ Dup(z1.VnB(), 0x11);
5639 __ Dup(z2.VnB(), 0x22);
5640 __ Dup(z3.VnB(), 0x33);
5641 __ Dup(z4.VnB(), 0x44);
5642
TatWai Chong4f28df72019-08-14 17:50:30 -07005643 __ Mov(x0, 0x0123456789abcdef);
5644 __ Fmov(d0, RawbitsToDouble(0x7ffaaaaa22223456));
5645 __ Insr(z1.VnS(), w0);
5646 __ Insr(z2.VnD(), x0);
5647 __ Insr(z3.VnH(), h0);
5648 __ Insr(z4.VnD(), d0);
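  // Each Insr shifts the existing lanes up by one element and writes the new
  // value into lane 0, as the expected values below demonstrate.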
5649
5650 uint64_t inputs[] = {0xfedcba9876543210,
5651 0x0123456789abcdef,
5652 0x8f8e8d8c8b8a8988,
5653 0x8786858483828180};
5654
5655 // Initialize a distinguishable value throughout the register first.
5656 __ Dup(z9.VnB(), 0xff);
5657 InsrHelper(&masm, z9.VnD(), inputs);
5658
5659 __ Rev(z5.VnB(), z9.VnB());
5660 __ Rev(z6.VnH(), z9.VnH());
5661 __ Rev(z7.VnS(), z9.VnS());
5662 __ Rev(z8.VnD(), z9.VnD());
5663
5664 int index[7] = {22, 7, 7, 3, 1, 1, 63};
5665 // Broadcasting a lane from within the input array.
5666 __ Dup(z10.VnB(), z9.VnB(), index[0]);
5667 __ Dup(z11.VnH(), z9.VnH(), index[1]);
5668 __ Dup(z12.VnS(), z9.VnS(), index[2]);
5669 __ Dup(z13.VnD(), z9.VnD(), index[3]);
5670 __ Dup(z14.VnQ(), z9.VnQ(), index[4]);
5671 // Test dst == src
5672 __ Mov(z15, z9);
5673 __ Dup(z15.VnS(), z15.VnS(), index[5]);
5674 // Selecting a lane beyond the input array.
5675 __ Dup(z16.VnB(), z9.VnB(), index[6]);
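  // An index beyond the current vector length makes the indexed Dup produce
  // zero; the expected values below account for this.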
5676
5677 END();
5678
5679 if (CAN_RUN()) {
5680 RUN();
5681
5682 // Insr
Jacob Bramleye4983d42019-10-08 10:56:15 +01005683 uint64_t z1_expected[] = {0x1111111111111111, 0x1111111189abcdef};
5684 uint64_t z2_expected[] = {0x2222222222222222, 0x0123456789abcdef};
5685 uint64_t z3_expected[] = {0x3333333333333333, 0x3333333333333456};
5686 uint64_t z4_expected[] = {0x4444444444444444, 0x7ffaaaaa22223456};
TatWai Chong4f28df72019-08-14 17:50:30 -07005687 ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
5688 ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
5689 ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
5690 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
5691
5692 // Rev
5693 int lane_count = core.GetSVELaneCount(kBRegSize);
5694 for (int i = 0; i < lane_count; i++) {
5695 uint64_t expected =
5696 core.zreg_lane(z5.GetCode(), kBRegSize, lane_count - i - 1);
5697 uint64_t input = core.zreg_lane(z9.GetCode(), kBRegSize, i);
5698 ASSERT_EQUAL_64(expected, input);
5699 }
5700
5701 lane_count = core.GetSVELaneCount(kHRegSize);
5702 for (int i = 0; i < lane_count; i++) {
5703 uint64_t expected =
5704 core.zreg_lane(z6.GetCode(), kHRegSize, lane_count - i - 1);
5705 uint64_t input = core.zreg_lane(z9.GetCode(), kHRegSize, i);
5706 ASSERT_EQUAL_64(expected, input);
5707 }
5708
5709 lane_count = core.GetSVELaneCount(kSRegSize);
5710 for (int i = 0; i < lane_count; i++) {
5711 uint64_t expected =
5712 core.zreg_lane(z7.GetCode(), kSRegSize, lane_count - i - 1);
5713 uint64_t input = core.zreg_lane(z9.GetCode(), kSRegSize, i);
5714 ASSERT_EQUAL_64(expected, input);
5715 }
5716
5717 lane_count = core.GetSVELaneCount(kDRegSize);
5718 for (int i = 0; i < lane_count; i++) {
5719 uint64_t expected =
5720 core.zreg_lane(z8.GetCode(), kDRegSize, lane_count - i - 1);
5721 uint64_t input = core.zreg_lane(z9.GetCode(), kDRegSize, i);
5722 ASSERT_EQUAL_64(expected, input);
5723 }
5724
5725 // Dup
5726 unsigned vl = config->sve_vl_in_bits();
5727 lane_count = core.GetSVELaneCount(kBRegSize);
5728 uint64_t expected_z10 = (vl > (index[0] * kBRegSize)) ? 0x23 : 0;
5729 for (int i = 0; i < lane_count; i++) {
5730 ASSERT_EQUAL_SVE_LANE(expected_z10, z10.VnB(), i);
5731 }
5732
5733 lane_count = core.GetSVELaneCount(kHRegSize);
5734 uint64_t expected_z11 = (vl > (index[1] * kHRegSize)) ? 0x8f8e : 0;
5735 for (int i = 0; i < lane_count; i++) {
5736 ASSERT_EQUAL_SVE_LANE(expected_z11, z11.VnH(), i);
5737 }
5738
5739 lane_count = core.GetSVELaneCount(kSRegSize);
5740 uint64_t expected_z12 = (vl > (index[2] * kSRegSize)) ? 0xfedcba98 : 0;
5741 for (int i = 0; i < lane_count; i++) {
5742 ASSERT_EQUAL_SVE_LANE(expected_z12, z12.VnS(), i);
5743 }
5744
5745 lane_count = core.GetSVELaneCount(kDRegSize);
5746 uint64_t expected_z13 =
5747 (vl > (index[3] * kDRegSize)) ? 0xfedcba9876543210 : 0;
5748 for (int i = 0; i < lane_count; i++) {
5749 ASSERT_EQUAL_SVE_LANE(expected_z13, z13.VnD(), i);
5750 }
5751
5752 lane_count = core.GetSVELaneCount(kDRegSize);
5753 uint64_t expected_z14_lo = 0;
5754 uint64_t expected_z14_hi = 0;
5755 if (vl > (index[4] * kQRegSize)) {
5756 expected_z14_lo = 0x0123456789abcdef;
5757 expected_z14_hi = 0xfedcba9876543210;
5758 }
5759 for (int i = 0; i < lane_count; i += 2) {
5760 ASSERT_EQUAL_SVE_LANE(expected_z14_lo, z14.VnD(), i);
5761 ASSERT_EQUAL_SVE_LANE(expected_z14_hi, z14.VnD(), i + 1);
5762 }
5763
5764 lane_count = core.GetSVELaneCount(kSRegSize);
5765 uint64_t expected_z15 = (vl > (index[5] * kSRegSize)) ? 0x87868584 : 0;
5766 for (int i = 0; i < lane_count; i++) {
5767 ASSERT_EQUAL_SVE_LANE(expected_z15, z15.VnS(), i);
5768 }
5769
5770 lane_count = core.GetSVELaneCount(kBRegSize);
5771 uint64_t expected_z16 = (vl > (index[6] * kBRegSize)) ? 0xff : 0;
5772 for (int i = 0; i < lane_count; i++) {
5773 ASSERT_EQUAL_SVE_LANE(expected_z16, z16.VnB(), i);
5774 }
5775 }
5776}
5777
Martyn Capewell2e954292020-01-14 14:56:42 +00005778TEST_SVE(sve_permute_vector_unpredicated_unpack_vector_elements) {
TatWai Chong4f28df72019-08-14 17:50:30 -07005779 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5780 START();
5781
5782 uint64_t z9_inputs[] = {0xfedcba9876543210,
5783 0x0123456789abcdef,
5784 0x8f8e8d8c8b8a8988,
5785 0x8786858483828180};
5786 InsrHelper(&masm, z9.VnD(), z9_inputs);
5787
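  // Sunpkhi/Uunpkhi widen the elements from the high half of the source and
  // Sunpklo/Uunpklo those from the low half; the S forms sign-extend and the
  // U forms zero-extend each element to twice its width.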
5788 __ Sunpkhi(z10.VnH(), z9.VnB());
5789 __ Sunpkhi(z11.VnS(), z9.VnH());
5790 __ Sunpkhi(z12.VnD(), z9.VnS());
5791
5792 __ Sunpklo(z13.VnH(), z9.VnB());
5793 __ Sunpklo(z14.VnS(), z9.VnH());
5794 __ Sunpklo(z15.VnD(), z9.VnS());
5795
5796 __ Uunpkhi(z16.VnH(), z9.VnB());
5797 __ Uunpkhi(z17.VnS(), z9.VnH());
5798 __ Uunpkhi(z18.VnD(), z9.VnS());
5799
5800 __ Uunpklo(z19.VnH(), z9.VnB());
5801 __ Uunpklo(z20.VnS(), z9.VnH());
5802 __ Uunpklo(z21.VnD(), z9.VnS());
5803
Martyn Capewell2e954292020-01-14 14:56:42 +00005804 // Test unpacking with same source and destination.
5805 __ Mov(z22, z9);
5806 __ Sunpklo(z22.VnH(), z22.VnB());
5807 __ Mov(z23, z9);
5808 __ Uunpklo(z23.VnH(), z23.VnB());
5809
TatWai Chong4f28df72019-08-14 17:50:30 -07005810 END();
5811
5812 if (CAN_RUN()) {
5813 RUN();
5814
5815    // Sunpkhi
5816 int lane_count = core.GetSVELaneCount(kHRegSize);
5817 for (int i = lane_count - 1; i >= 0; i--) {
5818 uint16_t expected = core.zreg_lane<uint16_t>(z10.GetCode(), i);
5819 uint8_t b_lane = core.zreg_lane<uint8_t>(z9.GetCode(), i + lane_count);
5820 uint16_t input = SignExtend<int16_t>(b_lane, kBRegSize);
5821 ASSERT_EQUAL_64(expected, input);
5822 }
5823
5824 lane_count = core.GetSVELaneCount(kSRegSize);
5825 for (int i = lane_count - 1; i >= 0; i--) {
5826 uint32_t expected = core.zreg_lane<uint32_t>(z11.GetCode(), i);
5827 uint16_t h_lane = core.zreg_lane<uint16_t>(z9.GetCode(), i + lane_count);
5828 uint32_t input = SignExtend<int32_t>(h_lane, kHRegSize);
5829 ASSERT_EQUAL_64(expected, input);
5830 }
5831
5832 lane_count = core.GetSVELaneCount(kDRegSize);
5833 for (int i = lane_count - 1; i >= 0; i--) {
5834 uint64_t expected = core.zreg_lane<uint64_t>(z12.GetCode(), i);
5835 uint32_t s_lane = core.zreg_lane<uint32_t>(z9.GetCode(), i + lane_count);
5836 uint64_t input = SignExtend<int64_t>(s_lane, kSRegSize);
5837 ASSERT_EQUAL_64(expected, input);
5838 }
5839
5840    // Sunpklo
5841 lane_count = core.GetSVELaneCount(kHRegSize);
5842 for (int i = lane_count - 1; i >= 0; i--) {
5843 uint16_t expected = core.zreg_lane<uint16_t>(z13.GetCode(), i);
5844 uint8_t b_lane = core.zreg_lane<uint8_t>(z9.GetCode(), i);
5845 uint16_t input = SignExtend<int16_t>(b_lane, kBRegSize);
5846 ASSERT_EQUAL_64(expected, input);
5847 }
5848
5849 lane_count = core.GetSVELaneCount(kSRegSize);
5850 for (int i = lane_count - 1; i >= 0; i--) {
5851 uint32_t expected = core.zreg_lane<uint32_t>(z14.GetCode(), i);
5852 uint16_t h_lane = core.zreg_lane<uint16_t>(z9.GetCode(), i);
5853 uint32_t input = SignExtend<int32_t>(h_lane, kHRegSize);
5854 ASSERT_EQUAL_64(expected, input);
5855 }
5856
5857 lane_count = core.GetSVELaneCount(kDRegSize);
5858 for (int i = lane_count - 1; i >= 0; i--) {
5859 uint64_t expected = core.zreg_lane<uint64_t>(z15.GetCode(), i);
5860 uint32_t s_lane = core.zreg_lane<uint32_t>(z9.GetCode(), i);
5861 uint64_t input = SignExtend<int64_t>(s_lane, kSRegSize);
5862 ASSERT_EQUAL_64(expected, input);
5863 }
5864
5865    // Uunpkhi
5866 lane_count = core.GetSVELaneCount(kHRegSize);
5867 for (int i = lane_count - 1; i >= 0; i--) {
5868 uint16_t expected = core.zreg_lane<uint16_t>(z16.GetCode(), i);
5869 uint16_t input = core.zreg_lane<uint8_t>(z9.GetCode(), i + lane_count);
5870 ASSERT_EQUAL_64(expected, input);
5871 }
5872
5873 lane_count = core.GetSVELaneCount(kSRegSize);
5874 for (int i = lane_count - 1; i >= 0; i--) {
5875 uint32_t expected = core.zreg_lane<uint32_t>(z17.GetCode(), i);
5876 uint32_t input = core.zreg_lane<uint16_t>(z9.GetCode(), i + lane_count);
5877 ASSERT_EQUAL_64(expected, input);
5878 }
5879
5880 lane_count = core.GetSVELaneCount(kDRegSize);
5881 for (int i = lane_count - 1; i >= 0; i--) {
5882 uint64_t expected = core.zreg_lane<uint64_t>(z18.GetCode(), i);
5883 uint64_t input = core.zreg_lane<uint32_t>(z9.GetCode(), i + lane_count);
5884 ASSERT_EQUAL_64(expected, input);
5885 }
5886
5887    // Uunpklo
5888 lane_count = core.GetSVELaneCount(kHRegSize);
5889 for (int i = lane_count - 1; i >= 0; i--) {
5890 uint16_t expected = core.zreg_lane<uint16_t>(z19.GetCode(), i);
5891 uint16_t input = core.zreg_lane<uint8_t>(z9.GetCode(), i);
5892 ASSERT_EQUAL_64(expected, input);
5893 }
5894
5895 lane_count = core.GetSVELaneCount(kSRegSize);
5896 for (int i = lane_count - 1; i >= 0; i--) {
5897 uint32_t expected = core.zreg_lane<uint32_t>(z20.GetCode(), i);
5898 uint32_t input = core.zreg_lane<uint16_t>(z9.GetCode(), i);
5899 ASSERT_EQUAL_64(expected, input);
5900 }
5901
5902 lane_count = core.GetSVELaneCount(kDRegSize);
5903 for (int i = lane_count - 1; i >= 0; i--) {
5904 uint64_t expected = core.zreg_lane<uint64_t>(z21.GetCode(), i);
5905 uint64_t input = core.zreg_lane<uint32_t>(z9.GetCode(), i);
5906 ASSERT_EQUAL_64(expected, input);
5907 }
Martyn Capewell2e954292020-01-14 14:56:42 +00005908
5909 ASSERT_EQUAL_SVE(z13, z22);
5910 ASSERT_EQUAL_SVE(z19, z23);
TatWai Chong4f28df72019-08-14 17:50:30 -07005911 }
5912}
5913
Jacob Bramleybc21a0d2019-09-20 18:49:15 +01005914TEST_SVE(sve_cnot_not) {
5915 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5916 START();
5917
5918 uint64_t in[] = {0x0000000000000000, 0x00000000e1c30000, 0x123456789abcdef0};
5919
5920 // For simplicity, we re-use the same pg for various lane sizes.
5921 // For D lanes: 1, 1, 0
5922 // For S lanes: 1, 1, 1, 0, 0
5923 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
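  // For lane sizes wider than a byte, only the predicate bit corresponding to
  // the lowest-numbered byte of each lane is significant, which is how the
  // byte-granular pattern below produces the per-size patterns listed above.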
5924 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
5925 Initialise(&masm, p0.VnB(), pg_in);
5926 PRegisterM pg = p0.Merging();
5927
5928 // These are merging operations, so we have to initialise the result register.
5929 // We use a mixture of constructive and destructive operations.
5930
5931 InsrHelper(&masm, z31.VnD(), in);
TatWai Chong6995bfd2019-09-26 10:48:05 +01005932 // Make a copy so we can check that constructive operations preserve zn.
Jacob Bramleybc21a0d2019-09-20 18:49:15 +01005933 __ Mov(z30, z31);
5934
5935 // For constructive operations, use a different initial result value.
5936 __ Index(z29.VnB(), 0, -1);
5937
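  // Cnot sets each active lane to one if the corresponding source lane is zero
  // and to zero otherwise, while Not inverts all the bits of each active lane.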
5938 __ Mov(z0, z31);
5939 __ Cnot(z0.VnB(), pg, z0.VnB()); // destructive
5940 __ Mov(z1, z29);
5941 __ Cnot(z1.VnH(), pg, z31.VnH());
5942 __ Mov(z2, z31);
5943 __ Cnot(z2.VnS(), pg, z2.VnS()); // destructive
5944 __ Mov(z3, z29);
5945 __ Cnot(z3.VnD(), pg, z31.VnD());
5946
5947 __ Mov(z4, z29);
5948 __ Not(z4.VnB(), pg, z31.VnB());
5949 __ Mov(z5, z31);
5950 __ Not(z5.VnH(), pg, z5.VnH()); // destructive
5951 __ Mov(z6, z29);
5952 __ Not(z6.VnS(), pg, z31.VnS());
5953 __ Mov(z7, z31);
5954 __ Not(z7.VnD(), pg, z7.VnD()); // destructive
5955
5956 END();
5957
5958 if (CAN_RUN()) {
5959 RUN();
5960
5961 // Check that constructive operations preserve their inputs.
5962 ASSERT_EQUAL_SVE(z30, z31);
5963
5964 // clang-format off
5965
5966 // Cnot (B) destructive
5967 uint64_t expected_z0[] =
5968 // pg: 0 0 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0
5969 {0x0000000001000101, 0x01000001e1000101, 0x12340078000000f0};
5970 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
5971
5972 // Cnot (H)
5973 uint64_t expected_z1[] =
5974 // pg: 0 0 0 1 0 1 1 1 0 0 1 0
5975 {0xe9eaebecedee0001, 0xf1f2000100000001, 0xf9fafbfc0000ff00};
5976 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
5977
5978 // Cnot (S) destructive
5979 uint64_t expected_z2[] =
5980 // pg: 0 1 1 1 0 0
5981 {0x0000000000000001, 0x0000000100000000, 0x123456789abcdef0};
5982 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
5983
5984 // Cnot (D)
5985 uint64_t expected_z3[] =
5986 // pg: 1 1 0
5987 {0x0000000000000001, 0x0000000000000000, 0xf9fafbfcfdfeff00};
5988 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
5989
5990 // Not (B)
5991 uint64_t expected_z4[] =
5992 // pg: 0 0 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0
5993 {0xe9eaebecffeeffff, 0xfff2f3fff53cffff, 0xf9faa9fc65432100};
5994 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
5995
5996 // Not (H) destructive
5997 uint64_t expected_z5[] =
5998 // pg: 0 0 0 1 0 1 1 1 0 0 1 0
5999 {0x000000000000ffff, 0x0000ffff1e3cffff, 0x123456786543def0};
6000 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
6001
6002 // Not (S)
6003 uint64_t expected_z6[] =
6004 // pg: 0 1 1 1 0 0
6005 {0xe9eaebecffffffff, 0xffffffff1e3cffff, 0xf9fafbfcfdfeff00};
6006 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
6007
6008 // Not (D) destructive
6009 uint64_t expected_z7[] =
6010 // pg: 1 1 0
6011 {0xffffffffffffffff, 0xffffffff1e3cffff, 0x123456789abcdef0};
6012 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
6013
6014 // clang-format on
6015 }
6016}
6017
6018TEST_SVE(sve_fabs_fneg) {
6019 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6020 START();
6021
6022 // Include FP64, FP32 and FP16 signalling NaNs. Most FP operations quieten
6023 // NaNs, but fabs and fneg do not.
6024 uint64_t in[] = {0xc04500004228d140, // Recognisable (+/-42) values.
6025 0xfff00000ff80fc01, // Signalling NaNs.
6026 0x123456789abcdef0};
6027
6028 // For simplicity, we re-use the same pg for various lane sizes.
6029 // For D lanes: 1, 1, 0
6030 // For S lanes: 1, 1, 1, 0, 0
6031 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
6032 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
6033 Initialise(&masm, p0.VnB(), pg_in);
6034 PRegisterM pg = p0.Merging();
6035
6036 // These are merging operations, so we have to initialise the result register.
6037 // We use a mixture of constructive and destructive operations.
6038
6039 InsrHelper(&masm, z31.VnD(), in);
TatWai Chong6995bfd2019-09-26 10:48:05 +01006040 // Make a copy so we can check that constructive operations preserve zn.
Jacob Bramleybc21a0d2019-09-20 18:49:15 +01006041 __ Mov(z30, z31);
6042
6043 // For constructive operations, use a different initial result value.
6044 __ Index(z29.VnB(), 0, -1);
6045
6046 __ Mov(z0, z29);
6047 __ Fabs(z0.VnH(), pg, z31.VnH());
6048 __ Mov(z1, z31);
6049 __ Fabs(z1.VnS(), pg, z1.VnS()); // destructive
6050 __ Mov(z2, z29);
6051 __ Fabs(z2.VnD(), pg, z31.VnD());
6052
6053 __ Mov(z3, z31);
6054 __ Fneg(z3.VnH(), pg, z3.VnH()); // destructive
6055 __ Mov(z4, z29);
6056 __ Fneg(z4.VnS(), pg, z31.VnS());
6057 __ Mov(z5, z31);
6058 __ Fneg(z5.VnD(), pg, z5.VnD()); // destructive
6059
6060 END();
6061
6062 if (CAN_RUN()) {
6063 RUN();
6064
6065 // Check that constructive operations preserve their inputs.
6066 ASSERT_EQUAL_SVE(z30, z31);
6067
6068 // clang-format off
6069
6070 // Fabs (H)
6071 uint64_t expected_z0[] =
6072 // pg: 0 0 0 1 0 1 1 1 0 0 1 0
6073 {0xe9eaebecedee5140, 0xf1f200007f807c01, 0xf9fafbfc1abcff00};
6074 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
6075
6076 // Fabs (S) destructive
6077 uint64_t expected_z1[] =
6078 // pg: 0 1 1 1 0 0
6079 {0xc04500004228d140, 0x7ff000007f80fc01, 0x123456789abcdef0};
6080 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
6081
6082 // Fabs (D)
6083 uint64_t expected_z2[] =
6084 // pg: 1 1 0
6085 {0x404500004228d140, 0x7ff00000ff80fc01, 0xf9fafbfcfdfeff00};
6086 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
6087
6088 // Fneg (H) destructive
6089 uint64_t expected_z3[] =
6090 // pg: 0 0 0 1 0 1 1 1 0 0 1 0
6091 {0xc045000042285140, 0xfff080007f807c01, 0x123456781abcdef0};
6092 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
6093
6094 // Fneg (S)
6095 uint64_t expected_z4[] =
6096 // pg: 0 1 1 1 0 0
6097 {0xe9eaebecc228d140, 0x7ff000007f80fc01, 0xf9fafbfcfdfeff00};
6098 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
6099
6100 // Fneg (D) destructive
6101 uint64_t expected_z5[] =
6102 // pg: 1 1 0
6103 {0x404500004228d140, 0x7ff00000ff80fc01, 0x123456789abcdef0};
6104 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
6105
6106 // clang-format on
6107 }
6108}
6109
6110TEST_SVE(sve_cls_clz_cnt) {
6111 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6112 START();
6113
6114 uint64_t in[] = {0x0000000000000000, 0xfefcf8f0e1c3870f, 0x123456789abcdef0};
6115
6116 // For simplicity, we re-use the same pg for various lane sizes.
6117 // For D lanes: 1, 1, 0
6118 // For S lanes: 1, 1, 1, 0, 0
6119 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
6120 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
6121 Initialise(&masm, p0.VnB(), pg_in);
6122 PRegisterM pg = p0.Merging();
6123
6124 // These are merging operations, so we have to initialise the result register.
6125 // We use a mixture of constructive and destructive operations.
6126
6127 InsrHelper(&masm, z31.VnD(), in);
TatWai Chong6995bfd2019-09-26 10:48:05 +01006128 // Make a copy so we can check that constructive operations preserve zn.
Jacob Bramleybc21a0d2019-09-20 18:49:15 +01006129 __ Mov(z30, z31);
6130
6131 // For constructive operations, use a different initial result value.
6132 __ Index(z29.VnB(), 0, -1);
6133
6134 __ Mov(z0, z29);
6135 __ Cls(z0.VnB(), pg, z31.VnB());
6136 __ Mov(z1, z31);
6137 __ Clz(z1.VnH(), pg, z1.VnH()); // destructive
6138 __ Mov(z2, z29);
6139 __ Cnt(z2.VnS(), pg, z31.VnS());
6140 __ Mov(z3, z31);
6141 __ Cnt(z3.VnD(), pg, z3.VnD()); // destructive
6142
6143 END();
6144
6145 if (CAN_RUN()) {
6146 RUN();
6147 // Check that non-destructive operations preserve their inputs.
6148 ASSERT_EQUAL_SVE(z30, z31);
6149
6150 // clang-format off
6151
6152 // cls (B)
6153 uint8_t expected_z0[] =
6154 // pg: 0 0 0 0 1 0 1 1
6155 // pg: 1 0 0 1 0 1 1 1
6156 // pg: 0 0 1 0 1 1 1 0
6157 {0xe9, 0xea, 0xeb, 0xec, 7, 0xee, 7, 7,
6158 6, 0xf2, 0xf3, 3, 0xf5, 1, 0, 3,
6159 0xf9, 0xfa, 0, 0xfc, 0, 0, 1, 0x00};
6160 ASSERT_EQUAL_SVE(expected_z0, z0.VnB());
6161
6162 // clz (H) destructive
6163 uint16_t expected_z1[] =
6164 // pg: 0 0 0 1
6165 // pg: 0 1 1 1
6166 // pg: 0 0 1 0
6167 {0x0000, 0x0000, 0x0000, 16,
6168 0xfefc, 0, 0, 0,
6169 0x1234, 0x5678, 0, 0xdef0};
6170 ASSERT_EQUAL_SVE(expected_z1, z1.VnH());
6171
6172 // cnt (S)
6173 uint32_t expected_z2[] =
6174 // pg: 0 1
6175 // pg: 1 1
6176 // pg: 0 0
6177 {0xe9eaebec, 0,
6178 22, 16,
6179 0xf9fafbfc, 0xfdfeff00};
6180 ASSERT_EQUAL_SVE(expected_z2, z2.VnS());
6181
6182 // cnt (D) destructive
6183 uint64_t expected_z3[] =
6184 // pg: 1 1 0
6185 { 0, 38, 0x123456789abcdef0};
6186 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
6187
6188 // clang-format on
6189 }
6190}
6191
6192TEST_SVE(sve_sxt) {
6193 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6194 START();
6195
6196 uint64_t in[] = {0x01f203f405f607f8, 0xfefcf8f0e1c3870f, 0x123456789abcdef0};
6197
6198 // For simplicity, we re-use the same pg for various lane sizes.
6199 // For D lanes: 1, 1, 0
6200 // For S lanes: 1, 1, 1, 0, 0
6201 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
6202 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
6203 Initialise(&masm, p0.VnB(), pg_in);
6204 PRegisterM pg = p0.Merging();
6205
6206 // These are merging operations, so we have to initialise the result register.
6207 // We use a mixture of constructive and destructive operations.
6208
6209 InsrHelper(&masm, z31.VnD(), in);
TatWai Chong6995bfd2019-09-26 10:48:05 +01006210 // Make a copy so we can check that constructive operations preserve zn.
Jacob Bramleybc21a0d2019-09-20 18:49:15 +01006211 __ Mov(z30, z31);
6212
6213 // For constructive operations, use a different initial result value.
6214 __ Index(z29.VnB(), 0, -1);
6215
6216 __ Mov(z0, z31);
6217 __ Sxtb(z0.VnH(), pg, z0.VnH()); // destructive
6218 __ Mov(z1, z29);
6219 __ Sxtb(z1.VnS(), pg, z31.VnS());
6220 __ Mov(z2, z31);
6221 __ Sxtb(z2.VnD(), pg, z2.VnD()); // destructive
6222 __ Mov(z3, z29);
6223 __ Sxth(z3.VnS(), pg, z31.VnS());
6224 __ Mov(z4, z31);
6225 __ Sxth(z4.VnD(), pg, z4.VnD()); // destructive
6226 __ Mov(z5, z29);
6227 __ Sxtw(z5.VnD(), pg, z31.VnD());
6228
6229 END();
6230
6231 if (CAN_RUN()) {
6232 RUN();
6233 // Check that constructive operations preserve their inputs.
6234 ASSERT_EQUAL_SVE(z30, z31);
6235
6236 // clang-format off
6237
6238 // Sxtb (H) destructive
6239 uint64_t expected_z0[] =
6240 // pg: 0 0 0 1 0 1 1 1 0 0 1 0
6241 {0x01f203f405f6fff8, 0xfefcfff0ffc3000f, 0x12345678ffbcdef0};
6242 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
6243
6244 // Sxtb (S)
6245 uint64_t expected_z1[] =
6246 // pg: 0 1 1 1 0 0
6247 {0xe9eaebecfffffff8, 0xfffffff00000000f, 0xf9fafbfcfdfeff00};
6248 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
6249
6250 // Sxtb (D) destructive
6251 uint64_t expected_z2[] =
6252 // pg: 1 1 0
6253 {0xfffffffffffffff8, 0x000000000000000f, 0x123456789abcdef0};
6254 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
6255
6256 // Sxth (S)
6257 uint64_t expected_z3[] =
6258 // pg: 0 1 1 1 0 0
6259 {0xe9eaebec000007f8, 0xfffff8f0ffff870f, 0xf9fafbfcfdfeff00};
6260 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
6261
6262 // Sxth (D) destructive
6263 uint64_t expected_z4[] =
6264 // pg: 1 1 0
6265 {0x00000000000007f8, 0xffffffffffff870f, 0x123456789abcdef0};
6266 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
6267
6268 // Sxtw (D)
6269 uint64_t expected_z5[] =
6270 // pg: 1 1 0
6271 {0x0000000005f607f8, 0xffffffffe1c3870f, 0xf9fafbfcfdfeff00};
6272 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
6273
6274 // clang-format on
6275 }
6276}
6277
6278TEST_SVE(sve_uxt) {
6279 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6280 START();
6281
6282 uint64_t in[] = {0x01f203f405f607f8, 0xfefcf8f0e1c3870f, 0x123456789abcdef0};
6283
6284 // For simplicity, we re-use the same pg for various lane sizes.
6285 // For D lanes: 1, 1, 0
6286 // For S lanes: 1, 1, 1, 0, 0
6287 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
6288 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
6289 Initialise(&masm, p0.VnB(), pg_in);
6290 PRegisterM pg = p0.Merging();
6291
6292 // These are merging operations, so we have to initialise the result register.
6293 // We use a mixture of constructive and destructive operations.
6294
6295 InsrHelper(&masm, z31.VnD(), in);
TatWai Chong6995bfd2019-09-26 10:48:05 +01006296 // Make a copy so we can check that constructive operations preserve zn.
Jacob Bramleybc21a0d2019-09-20 18:49:15 +01006297 __ Mov(z30, z31);
6298
6299 // For constructive operations, use a different initial result value.
6300 __ Index(z29.VnB(), 0, -1);
6301
6302 __ Mov(z0, z29);
6303 __ Uxtb(z0.VnH(), pg, z31.VnH());
6304 __ Mov(z1, z31);
6305 __ Uxtb(z1.VnS(), pg, z1.VnS()); // destructive
6306 __ Mov(z2, z29);
6307 __ Uxtb(z2.VnD(), pg, z31.VnD());
6308 __ Mov(z3, z31);
6309 __ Uxth(z3.VnS(), pg, z3.VnS()); // destructive
6310 __ Mov(z4, z29);
6311 __ Uxth(z4.VnD(), pg, z31.VnD());
6312 __ Mov(z5, z31);
6313 __ Uxtw(z5.VnD(), pg, z5.VnD()); // destructive
6314
6315 END();
6316
6317 if (CAN_RUN()) {
6318 RUN();
6319 // clang-format off
6320
6321 // Uxtb (H)
6322 uint64_t expected_z0[] =
6323 // pg: 0 0 0 1 0 1 1 1 0 0 1 0
6324 {0xe9eaebecedee00f8, 0xf1f200f000c3000f, 0xf9fafbfc00bcff00};
6325 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
6326
6327 // Uxtb (S) destructive
6328 uint64_t expected_z1[] =
6329 // pg: 0 1 1 1 0 0
6330 {0x01f203f4000000f8, 0x000000f00000000f, 0x123456789abcdef0};
6331 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
6332
6333 // Uxtb (D)
6334 uint64_t expected_z2[] =
6335 // pg: 1 1 0
6336 {0x00000000000000f8, 0x000000000000000f, 0xf9fafbfcfdfeff00};
6337 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
6338
6339 // Uxth (S) destructive
6340 uint64_t expected_z3[] =
6341 // pg: 0 1 1 1 0 0
6342 {0x01f203f4000007f8, 0x0000f8f00000870f, 0x123456789abcdef0};
6343 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
6344
6345 // Uxth (D)
6346 uint64_t expected_z4[] =
6347 // pg: 1 1 0
6348 {0x00000000000007f8, 0x000000000000870f, 0xf9fafbfcfdfeff00};
6349 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
6350
6351 // Uxtw (D) destructive
6352 uint64_t expected_z5[] =
6353 // pg: 1 1 0
6354 {0x0000000005f607f8, 0x00000000e1c3870f, 0x123456789abcdef0};
6355 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
6356
6357 // clang-format on
6358 }
6359}
6360
6361TEST_SVE(sve_abs_neg) {
6362 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6363 START();
6364
6365 uint64_t in[] = {0x01f203f405f607f8, 0xfefcf8f0e1c3870f, 0x123456789abcdef0};
6366
6367 // For simplicity, we re-use the same pg for various lane sizes.
6368 // For D lanes: 1, 1, 0
6369 // For S lanes: 1, 1, 1, 0, 0
6370 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
6371 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
6372 Initialise(&masm, p0.VnB(), pg_in);
6373 PRegisterM pg = p0.Merging();
6374
6377 // These are merging operations, so we have to initialise the result register.
6378 // We use a mixture of constructive and destructive operations.
6379
6380 InsrHelper(&masm, z31.VnD(), in);
TatWai Chong6995bfd2019-09-26 10:48:05 +01006381 // Make a copy so we can check that constructive operations preserve zn.
Jacob Bramleybc21a0d2019-09-20 18:49:15 +01006382 __ Mov(z30, z31);
6383
6384 // For constructive operations, use a different initial result value.
6385 __ Index(z29.VnB(), 0, -1);
6386
6387 __ Mov(z0, z31);
6388 __ Abs(z0.VnD(), pg, z0.VnD()); // destructive
6389 __ Mov(z1, z29);
6390 __ Abs(z1.VnB(), pg, z31.VnB());
6391
6392 __ Mov(z2, z31);
6393 __ Neg(z2.VnH(), pg, z2.VnH()); // destructive
6394 __ Mov(z3, z29);
6395 __ Neg(z3.VnS(), pg, z31.VnS());
6396
Jacob Bramleyc0066272019-09-30 16:30:47 +01006397 // The unpredicated form of `Neg` is implemented using `subr`.
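  // (A reversed subtract from an immediate of zero computes 0 - zn, so no
  // governing predicate is required.)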
6398 __ Mov(z4, z31);
6399 __ Neg(z4.VnB(), z4.VnB()); // destructive
6400 __ Mov(z5, z29);
6401 __ Neg(z5.VnD(), z31.VnD());
6402
Jacob Bramleybc21a0d2019-09-20 18:49:15 +01006403 END();
6404
6405 if (CAN_RUN()) {
6406 RUN();
Jacob Bramleyc0066272019-09-30 16:30:47 +01006407
6408 ASSERT_EQUAL_SVE(z30, z31);
6409
Jacob Bramleybc21a0d2019-09-20 18:49:15 +01006410 // clang-format off
6411
6412 // Abs (D) destructive
6413 uint64_t expected_z0[] =
6414 // pg: 1 1 0
6415 {0x01f203f405f607f8, 0x0103070f1e3c78f1, 0x123456789abcdef0};
6416 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
6417
6418 // Abs (B)
6419 uint64_t expected_z1[] =
6420 // pg: 0 0 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0
6421 {0xe9eaebec05ee0708, 0x02f2f310f53d790f, 0xf9fa56fc66442200};
6422 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
6423
6424 // Neg (H) destructive
6425 uint64_t expected_z2[] =
6426 // pg: 0 0 0 1 0 1 1 1 0 0 1 0
6427 {0x01f203f405f6f808, 0xfefc07101e3d78f1, 0x123456786544def0};
6428 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
6429
6430 // Neg (S)
6431 uint64_t expected_z3[] =
6432 // pg: 0 1 1 1 0 0
6433 {0xe9eaebecfa09f808, 0x010307101e3c78f1, 0xf9fafbfcfdfeff00};
6434 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
6435
Jacob Bramleyc0066272019-09-30 16:30:47 +01006436 // Neg (B) destructive, unpredicated
6437 uint64_t expected_z4[] =
6438 {0xff0efd0cfb0af908, 0x020408101f3d79f1, 0xeeccaa8866442210};
6439 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
6440
6441 // Neg (D) unpredicated
6442 uint64_t expected_z5[] =
6443 {0xfe0dfc0bfa09f808, 0x0103070f1e3c78f1, 0xedcba98765432110};
6444 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
6445
Jacob Bramleybc21a0d2019-09-20 18:49:15 +01006446 // clang-format on
6447 }
6448}
6449
Jacob Bramley0093bb92019-10-04 15:54:10 +01006450TEST_SVE(sve_cpy) {
6451 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
6452 START();
6453
6454 // For simplicity, we re-use the same pg for various lane sizes.
6455 // For D lanes: 0, 1, 1
6456 // For S lanes: 0, 1, 1, 0, 1
6457 // For H lanes: 1, 0, 0, 1, 0, 1, 1, 0, 0, 1
6458 int pg_in[] = {1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1};
6459
6460 PRegisterM pg = p7.Merging();
6461 Initialise(&masm, pg.VnB(), pg_in);
6462
6463 // These are merging operations, so we have to initialise the result registers
6464 // for each operation.
6465 for (unsigned i = 0; i < kNumberOfZRegisters; i++) {
6466 __ Index(ZRegister(i, kBRegSize), 0, -1);
6467 }
6468
6469 // Recognisable values to copy.
6470 __ Mov(x0, 0xdeadbeefdeadbe42);
6471 __ Mov(x1, 0xdeadbeefdead8421);
6472 __ Mov(x2, 0xdeadbeef80042001);
6473 __ Mov(x3, 0x8000000420000001);
6474
6475 // Use NEON moves, to avoid testing SVE `cpy` against itself.
6476 __ Dup(v28.V2D(), x0);
6477 __ Dup(v29.V2D(), x1);
6478 __ Dup(v30.V2D(), x2);
6479 __ Dup(v31.V2D(), x3);
6480
6481 // Register forms (CPY_z_p_r)
6482 __ Cpy(z0.VnB(), pg, w0);
6483 __ Cpy(z1.VnH(), pg, x1); // X registers are accepted for small lanes.
6484 __ Cpy(z2.VnS(), pg, w2);
6485 __ Cpy(z3.VnD(), pg, x3);
6486
6487 // VRegister forms (CPY_z_p_v)
6488 __ Cpy(z4.VnB(), pg, b28);
6489 __ Cpy(z5.VnH(), pg, h29);
6490 __ Cpy(z6.VnS(), pg, s30);
6491 __ Cpy(z7.VnD(), pg, d31);
6492
6493 // Check that we can copy the stack pointer.
6494 __ Mov(x10, sp);
6495 __ Mov(sp, 0xabcabcabcabcabca); // Set sp to a known value.
6496 __ Cpy(z16.VnB(), pg, sp);
6497 __ Cpy(z17.VnH(), pg, wsp);
6498 __ Cpy(z18.VnS(), pg, wsp);
6499 __ Cpy(z19.VnD(), pg, sp);
6500 __ Mov(sp, x10); // Restore sp.
6501
6502 END();
6503
6504 if (CAN_RUN()) {
6505 RUN();
6506 // clang-format off
6507
6508 uint64_t expected_b[] =
6509 // pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1
6510 {0xe9eaebec424242f0, 0x42f2f34242f64242, 0xf942fbfcfdfeff42};
6511 ASSERT_EQUAL_SVE(expected_b, z0.VnD());
6512 ASSERT_EQUAL_SVE(expected_b, z4.VnD());
6513
6514 uint64_t expected_h[] =
6515 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6516 {0xe9eaebec8421eff0, 0xf1f28421f5f68421, 0x8421fbfcfdfe8421};
6517 ASSERT_EQUAL_SVE(expected_h, z1.VnD());
6518 ASSERT_EQUAL_SVE(expected_h, z5.VnD());
6519
6520 uint64_t expected_s[] =
6521 // pg: 0 0 1 1 0 1
6522 {0xe9eaebecedeeeff0, 0x8004200180042001, 0xf9fafbfc80042001};
6523 ASSERT_EQUAL_SVE(expected_s, z2.VnD());
6524 ASSERT_EQUAL_SVE(expected_s, z6.VnD());
6525
6526 uint64_t expected_d[] =
6527 // pg: 0 1 1
6528 {0xe9eaebecedeeeff0, 0x8000000420000001, 0x8000000420000001};
6529 ASSERT_EQUAL_SVE(expected_d, z3.VnD());
6530 ASSERT_EQUAL_SVE(expected_d, z7.VnD());
6531
6532
6533 uint64_t expected_b_sp[] =
6534 // pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1
6535 {0xe9eaebeccacacaf0, 0xcaf2f3cacaf6caca, 0xf9cafbfcfdfeffca};
6536 ASSERT_EQUAL_SVE(expected_b_sp, z16.VnD());
6537
6538 uint64_t expected_h_sp[] =
6539 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6540 {0xe9eaebecabcaeff0, 0xf1f2abcaf5f6abca, 0xabcafbfcfdfeabca};
6541 ASSERT_EQUAL_SVE(expected_h_sp, z17.VnD());
6542
6543 uint64_t expected_s_sp[] =
6544 // pg: 0 0 1 1 0 1
6545 {0xe9eaebecedeeeff0, 0xcabcabcacabcabca, 0xf9fafbfccabcabca};
6546 ASSERT_EQUAL_SVE(expected_s_sp, z18.VnD());
6547
6548 uint64_t expected_d_sp[] =
6549 // pg: 0 1 1
6550 {0xe9eaebecedeeeff0, 0xabcabcabcabcabca, 0xabcabcabcabcabca};
6551 ASSERT_EQUAL_SVE(expected_d_sp, z19.VnD());
6552
6553 // clang-format on
6554 }
6555}
6556
Jacob Bramley0f62eab2019-10-23 17:07:47 +01006557TEST_SVE(sve_cpy_imm) {
6558 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6559 START();
6560
6561 // For simplicity, we re-use the same pg for various lane sizes.
6562 // For D lanes: 0, 1, 1
6563 // For S lanes: 0, 1, 1, 0, 1
6564 // For H lanes: 1, 0, 0, 1, 0, 1, 1, 0, 0, 1
6565 int pg_in[] = {1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1};
6566
6567 PRegister pg = p7;
6568 Initialise(&masm, pg.VnB(), pg_in);
6569
6570 // These are (mostly) merging operations, so we have to initialise the result
6571 // registers for each operation.
6572 for (unsigned i = 0; i < kNumberOfZRegisters; i++) {
6573 __ Index(ZRegister(i, kBRegSize), 0, -1);
6574 }
6575
6576 // Encodable integer forms (CPY_z_p_i)
6577 __ Cpy(z0.VnB(), pg.Merging(), 0);
6578 __ Cpy(z1.VnB(), pg.Zeroing(), 42);
6579 __ Cpy(z2.VnB(), pg.Merging(), -42);
6580 __ Cpy(z3.VnB(), pg.Zeroing(), 0xff);
6581 __ Cpy(z4.VnH(), pg.Merging(), 127);
6582 __ Cpy(z5.VnS(), pg.Zeroing(), -128);
6583 __ Cpy(z6.VnD(), pg.Merging(), -1);
6584
6585 // Forms encodable using fcpy.
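  // These raw bit patterns don't fit cpy's shifted 8-bit immediate, but each is
  // a valid FP immediate, so the MacroAssembler can be expected to materialise
  // them using fcpy.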
6586 __ Cpy(z7.VnH(), pg.Merging(), Float16ToRawbits(Float16(-31.0)));
6587 __ Cpy(z8.VnS(), pg.Zeroing(), FloatToRawbits(2.0f));
6588 __ Cpy(z9.VnD(), pg.Merging(), DoubleToRawbits(-4.0));
6589
6590 // Other forms use a scratch register.
6591 __ Cpy(z10.VnH(), pg.Merging(), 0xff);
6592 __ Cpy(z11.VnD(), pg.Zeroing(), 0x0123456789abcdef);
6593
6594 END();
6595
6596 if (CAN_RUN()) {
6597 RUN();
6598 // clang-format off
6599
6600 uint64_t expected_z0[] =
6601 // pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1
6602 {0xe9eaebec000000f0, 0x00f2f30000f60000, 0xf900fbfcfdfeff00};
6603 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
6604
6605 uint64_t expected_z1[] =
6606 // pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1
6607 {0x000000002a2a2a00, 0x2a00002a2a002a2a, 0x002a00000000002a};
6608 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
6609
6610 uint64_t expected_z2[] =
6611 // pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1
6612 {0xe9eaebecd6d6d6f0, 0xd6f2f3d6d6f6d6d6, 0xf9d6fbfcfdfeffd6};
6613 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
6614
6615 uint64_t expected_z3[] =
6616 // pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1
6617 {0x00000000ffffff00, 0xff0000ffff00ffff, 0x00ff0000000000ff};
6618 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
6619
6620 uint64_t expected_z4[] =
6621 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6622 {0xe9eaebec007feff0, 0xf1f2007ff5f6007f, 0x007ffbfcfdfe007f};
6623 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
6624
6625 uint64_t expected_z5[] =
6626 // pg: 0 0 1 1 0 1
6627 {0x0000000000000000, 0xffffff80ffffff80, 0x00000000ffffff80};
6628 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
6629
6630 uint64_t expected_z6[] =
6631 // pg: 0 1 1
6632 {0xe9eaebecedeeeff0, 0xffffffffffffffff, 0xffffffffffffffff};
6633 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
6634
6635 uint64_t expected_z7[] =
6636 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6637 {0xe9eaebeccfc0eff0, 0xf1f2cfc0f5f6cfc0, 0xcfc0fbfcfdfecfc0};
6638 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
6639
6640 uint64_t expected_z8[] =
6641 // pg: 0 0 1 1 0 1
6642 {0x0000000000000000, 0x4000000040000000, 0x0000000040000000};
6643 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
6644
6645 uint64_t expected_z9[] =
6646 // pg: 0 1 1
6647 {0xe9eaebecedeeeff0, 0xc010000000000000, 0xc010000000000000};
6648 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
6649
6650 uint64_t expected_z10[] =
6651 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6652 {0xe9eaebec00ffeff0, 0xf1f200fff5f600ff, 0x00fffbfcfdfe00ff};
6653 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
6654
6655 uint64_t expected_z11[] =
6656 // pg: 0 1 1
6657 {0x0000000000000000, 0x0123456789abcdef, 0x0123456789abcdef};
6658 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
6659
6660 // clang-format on
6661 }
6662}
6663
6664TEST_SVE(sve_fcpy_imm) {
6665 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6666 START();
6667
6668 // For simplicity, we re-use the same pg for various lane sizes.
6669 // For D lanes: 0, 1, 1
6670 // For S lanes: 0, 1, 1, 0, 1
6671 // For H lanes: 1, 0, 0, 1, 0, 1, 1, 0, 0, 1
6672 int pg_in[] = {1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1};
6673
6674 PRegister pg = p7;
6675 Initialise(&masm, pg.VnB(), pg_in);
6676
6677 // These are (mostly) merging operations, so we have to initialise the result
6678 // registers for each operation.
6679 for (unsigned i = 0; i < kNumberOfZRegisters; i++) {
6680 __ Index(ZRegister(i, kBRegSize), 0, -1);
6681 }
6682
6683 // Encodable floating-point forms (FCPY_z_p_i)
6684 __ Fcpy(z1.VnH(), pg.Merging(), Float16(1.0));
6685 __ Fcpy(z2.VnH(), pg.Merging(), -2.0f);
6686 __ Fcpy(z3.VnH(), pg.Merging(), 3.0);
6687 __ Fcpy(z4.VnS(), pg.Merging(), Float16(-4.0));
6688 __ Fcpy(z5.VnS(), pg.Merging(), 5.0f);
6689 __ Fcpy(z6.VnS(), pg.Merging(), 6.0);
6690 __ Fcpy(z7.VnD(), pg.Merging(), Float16(7.0));
6691 __ Fcpy(z8.VnD(), pg.Merging(), 8.0f);
Martyn Capewell7db82102020-06-02 16:40:09 +01006692 __ Fmov(z9.VnD(), pg.Merging(), -9.0);
Jacob Bramley0f62eab2019-10-23 17:07:47 +01006693
6694 // Unencodable immediates.
6695 __ Fcpy(z10.VnS(), pg.Merging(), 0.0);
6696 __ Fcpy(z11.VnH(), pg.Merging(), Float16(42.0));
6697 __ Fcpy(z12.VnD(), pg.Merging(), RawbitsToDouble(0x7ff0000012340000)); // NaN
6698 __ Fcpy(z13.VnH(), pg.Merging(), kFP64NegativeInfinity);
6699
Martyn Capewell7db82102020-06-02 16:40:09 +01006700 // Fmov alias.
6701 __ Fmov(z14.VnS(), pg.Merging(), 0.0);
6702 __ Fmov(z15.VnH(), pg.Merging(), Float16(42.0));
6703 __ Fmov(z16.VnD(), pg.Merging(), RawbitsToDouble(0x7ff0000012340000)); // NaN
6704 __ Fmov(z17.VnH(), pg.Merging(), kFP64NegativeInfinity);
Jacob Bramley0f62eab2019-10-23 17:07:47 +01006705 END();
6706
6707 if (CAN_RUN()) {
6708 RUN();
6709 // clang-format off
6710
6711 // 1.0 as FP16: 0x3c00
6712 uint64_t expected_z1[] =
6713 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6714 {0xe9eaebec3c00eff0, 0xf1f23c00f5f63c00, 0x3c00fbfcfdfe3c00};
6715 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
6716
6717 // -2.0 as FP16: 0xc000
6718 uint64_t expected_z2[] =
6719 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6720 {0xe9eaebecc000eff0, 0xf1f2c000f5f6c000, 0xc000fbfcfdfec000};
6721 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
6722
6723 // 3.0 as FP16: 0x4200
6724 uint64_t expected_z3[] =
6725 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6726 {0xe9eaebec4200eff0, 0xf1f24200f5f64200, 0x4200fbfcfdfe4200};
6727 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
6728
6729 // -4.0 as FP32: 0xc0800000
6730 uint64_t expected_z4[] =
6731 // pg: 0 0 1 1 0 1
6732 {0xe9eaebecedeeeff0, 0xc0800000c0800000, 0xf9fafbfcc0800000};
6733 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
6734
6735 // 5.0 as FP32: 0x40a00000
6736 uint64_t expected_z5[] =
6737 // pg: 0 0 1 1 0 1
6738 {0xe9eaebecedeeeff0, 0x40a0000040a00000, 0xf9fafbfc40a00000};
6739 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
6740
6741 // 6.0 as FP32: 0x40c00000
6742 uint64_t expected_z6[] =
6743 // pg: 0 0 1 1 0 1
6744 {0xe9eaebecedeeeff0, 0x40c0000040c00000, 0xf9fafbfc40c00000};
6745 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
6746
6747 // 7.0 as FP64: 0x401c000000000000
6748 uint64_t expected_z7[] =
6749 // pg: 0 1 1
6750 {0xe9eaebecedeeeff0, 0x401c000000000000, 0x401c000000000000};
6751 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
6752
6753 // 8.0 as FP64: 0x4020000000000000
6754 uint64_t expected_z8[] =
6755 // pg: 0 1 1
6756 {0xe9eaebecedeeeff0, 0x4020000000000000, 0x4020000000000000};
6757 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
6758
6759 // -9.0 as FP64: 0xc022000000000000
6760 uint64_t expected_z9[] =
6761 // pg: 0 1 1
6762 {0xe9eaebecedeeeff0, 0xc022000000000000, 0xc022000000000000};
6763 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
6764
6765 // 0.0 as FP32: 0x00000000
6766 uint64_t expected_z10[] =
6767 // pg: 0 0 1 1 0 1
6768 {0xe9eaebecedeeeff0, 0x0000000000000000, 0xf9fafbfc00000000};
6769 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
6770
6771 // 42.0 as FP16: 0x5140
6772 uint64_t expected_z11[] =
6773 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6774 {0xe9eaebec5140eff0, 0xf1f25140f5f65140, 0x5140fbfcfdfe5140};
6775 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
6776
6777 // Signalling NaN (with payload): 0x7ff0000012340000
6778 uint64_t expected_z12[] =
6779 // pg: 0 1 1
6780 {0xe9eaebecedeeeff0, 0x7ff0000012340000, 0x7ff0000012340000};
6781 ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
6782
6783 // -infinity as FP16: 0xfc00
6784 uint64_t expected_z13[] =
6785 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6786 {0xe9eaebecfc00eff0, 0xf1f2fc00f5f6fc00, 0xfc00fbfcfdfefc00};
6787 ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
6788
Martyn Capewell7db82102020-06-02 16:40:09 +01006789 ASSERT_EQUAL_SVE(z10.VnD(), z14.VnD());
6790 ASSERT_EQUAL_SVE(z11.VnD(), z15.VnD());
6791 ASSERT_EQUAL_SVE(z12.VnD(), z16.VnD());
6792 ASSERT_EQUAL_SVE(z13.VnD(), z17.VnD());
Jacob Bramley0f62eab2019-10-23 17:07:47 +01006793 // clang-format on
6794 }
6795}
6796
TatWai Chong4f28df72019-08-14 17:50:30 -07006797TEST_SVE(sve_permute_vector_unpredicated_table_lookup) {
6798 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6799 START();
6800
6801 uint64_t table_inputs[] = {0xffeeddccbbaa9988, 0x7766554433221100};
6802
6803 int index_b[] = {255, 255, 11, 10, 15, 14, 13, 12, 1, 0, 4, 3, 7, 6, 5, 4};
6804
6805 int index_h[] = {5, 6, 7, 8, 2, 3, 6, 4};
6806
6807 int index_s[] = {1, 3, 2, 31, -1};
6808
6809 int index_d[] = {31, 1};
6810
6811  // Initialise the register with a value that doesn't exist in the table.
6812 __ Dup(z9.VnB(), 0x1f);
6813 InsrHelper(&masm, z9.VnD(), table_inputs);
6814
6815 ZRegister ind_b = z0.WithLaneSize(kBRegSize);
6816 ZRegister ind_h = z1.WithLaneSize(kHRegSize);
6817 ZRegister ind_s = z2.WithLaneSize(kSRegSize);
6818 ZRegister ind_d = z3.WithLaneSize(kDRegSize);
6819
6820 InsrHelper(&masm, ind_b, index_b);
6821 InsrHelper(&masm, ind_h, index_h);
6822 InsrHelper(&masm, ind_s, index_s);
6823 InsrHelper(&masm, ind_d, index_d);
6824
6825 __ Tbl(z26.VnB(), z9.VnB(), ind_b);
6826
6827 __ Tbl(z27.VnH(), z9.VnH(), ind_h);
6828
6829 __ Tbl(z28.VnS(), z9.VnS(), ind_s);
6830
6831 __ Tbl(z29.VnD(), z9.VnD(), ind_d);
6832
6833 END();
6834
6835 if (CAN_RUN()) {
6836 RUN();
6837
6838 // clang-format off
6839 unsigned z26_expected[] = {0x1f, 0x1f, 0xbb, 0xaa, 0xff, 0xee, 0xdd, 0xcc,
6840 0x11, 0x00, 0x44, 0x33, 0x77, 0x66, 0x55, 0x44};
6841
6842 unsigned z27_expected[] = {0xbbaa, 0xddcc, 0xffee, 0x1f1f,
6843 0x5544, 0x7766, 0xddcc, 0x9988};
6844
6845 unsigned z28_expected[] =
6846 {0x77665544, 0xffeeddcc, 0xbbaa9988, 0x1f1f1f1f, 0x1f1f1f1f};
6847
6848 uint64_t z29_expected[] = {0x1f1f1f1f1f1f1f1f, 0xffeeddccbbaa9988};
6849 // clang-format on
6850
6851 unsigned vl = config->sve_vl_in_bits();
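    // Tbl reads zero for any index beyond the accessible vector, so each
    // expected value is gated on the vector length.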
6852 for (size_t i = 0; i < ArrayLength(index_b); i++) {
6853 int lane = static_cast<int>(ArrayLength(index_b) - i - 1);
6854 if (!core.HasSVELane(z26.VnB(), lane)) break;
6855 uint64_t expected = (vl > (index_b[i] * kBRegSize)) ? z26_expected[i] : 0;
6856 ASSERT_EQUAL_SVE_LANE(expected, z26.VnB(), lane);
6857 }
6858
6859 for (size_t i = 0; i < ArrayLength(index_h); i++) {
6860 int lane = static_cast<int>(ArrayLength(index_h) - i - 1);
6861 if (!core.HasSVELane(z27.VnH(), lane)) break;
6862 uint64_t expected = (vl > (index_h[i] * kHRegSize)) ? z27_expected[i] : 0;
6863 ASSERT_EQUAL_SVE_LANE(expected, z27.VnH(), lane);
6864 }
6865
6866 for (size_t i = 0; i < ArrayLength(index_s); i++) {
6867 int lane = static_cast<int>(ArrayLength(index_s) - i - 1);
6868 if (!core.HasSVELane(z28.VnS(), lane)) break;
6869 uint64_t expected = (vl > (index_s[i] * kSRegSize)) ? z28_expected[i] : 0;
6870 ASSERT_EQUAL_SVE_LANE(expected, z28.VnS(), lane);
6871 }
6872
6873 for (size_t i = 0; i < ArrayLength(index_d); i++) {
6874 int lane = static_cast<int>(ArrayLength(index_d) - i - 1);
6875 if (!core.HasSVELane(z29.VnD(), lane)) break;
6876 uint64_t expected = (vl > (index_d[i] * kDRegSize)) ? z29_expected[i] : 0;
6877 ASSERT_EQUAL_SVE_LANE(expected, z29.VnD(), lane);
6878 }
6879 }
6880}
6881
Jacob Bramley199339d2019-08-05 18:49:13 +01006882TEST_SVE(ldr_str_z_bi) {
6883 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6884 START();
6885
6886 int vl = config->sve_vl_in_bytes();
6887
6888 // The immediate can address [-256, 255] times the VL, so allocate enough
6889 // space to exceed that in both directions.
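  // Placing the base in the middle of the buffer leaves 512 * VL in each
  // direction, which also covers the +/-314 MUL_VL accesses used below.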
6890 int data_size = vl * 1024;
6891
6892 uint8_t* data = new uint8_t[data_size];
6893 memset(data, 0, data_size);
6894
6895 // Set the base half-way through the buffer so we can use negative indices.
6896 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
6897
6898 __ Index(z1.VnB(), 1, 3);
6899 __ Index(z2.VnB(), 2, 5);
6900 __ Index(z3.VnB(), 3, 7);
6901 __ Index(z4.VnB(), 4, 11);
6902 __ Index(z5.VnB(), 5, 13);
6903 __ Index(z6.VnB(), 6, 2);
6904 __ Index(z7.VnB(), 7, 3);
6905 __ Index(z8.VnB(), 8, 5);
6906 __ Index(z9.VnB(), 9, 7);
6907
6908 // Encodable cases.
6909 __ Str(z1, SVEMemOperand(x0));
6910 __ Str(z2, SVEMemOperand(x0, 2, SVE_MUL_VL));
6911 __ Str(z3, SVEMemOperand(x0, -3, SVE_MUL_VL));
6912 __ Str(z4, SVEMemOperand(x0, 255, SVE_MUL_VL));
6913 __ Str(z5, SVEMemOperand(x0, -256, SVE_MUL_VL));
6914
Jacob Bramley6ebbba62019-10-09 15:02:10 +01006915 // Cases that fall back on `CalculateSVEAddress`.
Jacob Bramley199339d2019-08-05 18:49:13 +01006916 __ Str(z6, SVEMemOperand(x0, 6 * vl));
6917 __ Str(z7, SVEMemOperand(x0, -7 * vl));
6918 __ Str(z8, SVEMemOperand(x0, 314, SVE_MUL_VL));
6919 __ Str(z9, SVEMemOperand(x0, -314, SVE_MUL_VL));
6920
6921 // Corresponding loads.
6922 __ Ldr(z11, SVEMemOperand(x0, xzr)); // Test xzr operand.
6923 __ Ldr(z12, SVEMemOperand(x0, 2, SVE_MUL_VL));
6924 __ Ldr(z13, SVEMemOperand(x0, -3, SVE_MUL_VL));
6925 __ Ldr(z14, SVEMemOperand(x0, 255, SVE_MUL_VL));
6926 __ Ldr(z15, SVEMemOperand(x0, -256, SVE_MUL_VL));
6927
6928 __ Ldr(z16, SVEMemOperand(x0, 6 * vl));
6929 __ Ldr(z17, SVEMemOperand(x0, -7 * vl));
6930 __ Ldr(z18, SVEMemOperand(x0, 314, SVE_MUL_VL));
6931 __ Ldr(z19, SVEMemOperand(x0, -314, SVE_MUL_VL));
6932
6933 END();
6934
6935 if (CAN_RUN()) {
6936 RUN();
6937
6938 uint8_t* expected = new uint8_t[data_size];
6939 memset(expected, 0, data_size);
6940 uint8_t* middle = &expected[data_size / 2];
6941
6942 for (int i = 0; i < vl; i++) {
6943 middle[i] = (1 + (3 * i)) & 0xff; // z1
6944 middle[(2 * vl) + i] = (2 + (5 * i)) & 0xff; // z2
6945 middle[(-3 * vl) + i] = (3 + (7 * i)) & 0xff; // z3
6946 middle[(255 * vl) + i] = (4 + (11 * i)) & 0xff; // z4
6947 middle[(-256 * vl) + i] = (5 + (13 * i)) & 0xff; // z5
6948 middle[(6 * vl) + i] = (6 + (2 * i)) & 0xff; // z6
6949 middle[(-7 * vl) + i] = (7 + (3 * i)) & 0xff; // z7
6950 middle[(314 * vl) + i] = (8 + (5 * i)) & 0xff; // z8
6951 middle[(-314 * vl) + i] = (9 + (7 * i)) & 0xff; // z9
6952 }
6953
Jacob Bramley33c99f92019-10-08 15:24:12 +01006954 ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
Jacob Bramley199339d2019-08-05 18:49:13 +01006955
6956 ASSERT_EQUAL_SVE(z1, z11);
6957 ASSERT_EQUAL_SVE(z2, z12);
6958 ASSERT_EQUAL_SVE(z3, z13);
6959 ASSERT_EQUAL_SVE(z4, z14);
6960 ASSERT_EQUAL_SVE(z5, z15);
6961 ASSERT_EQUAL_SVE(z6, z16);
6962 ASSERT_EQUAL_SVE(z7, z17);
6963 ASSERT_EQUAL_SVE(z8, z18);
6964 ASSERT_EQUAL_SVE(z9, z19);
6965
6966 delete[] expected;
6967 }
6968 delete[] data;
6969}
6970
6971TEST_SVE(ldr_str_p_bi) {
6972 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6973 START();
6974
6975 int vl = config->sve_vl_in_bytes();
6976 VIXL_ASSERT((vl % kZRegBitsPerPRegBit) == 0);
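  // Each predicate bit governs one byte of a Z register, so a P register is an
  // eighth of the size of a Z register.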
6977 int pl = vl / kZRegBitsPerPRegBit;
6978
6979 // The immediate can address [-256, 255] times the PL, so allocate enough
6980 // space to exceed that in both directions.
6981 int data_size = pl * 1024;
6982
6983 uint8_t* data = new uint8_t[data_size];
6984 memset(data, 0, data_size);
6985
6986 // Set the base half-way through the buffer so we can use negative indices.
6987 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
6988
6989 uint64_t pattern[4] = {0x1010101011101111,
6990 0x0010111011000101,
6991 0x1001101110010110,
6992 0x1010110101100011};
6993 for (int i = 8; i <= 15; i++) {
6994 // Initialise p8-p15 with a conveniently-recognisable, non-zero pattern.
6995 Initialise(&masm,
6996 PRegister(i),
6997 pattern[3] * i,
6998 pattern[2] * i,
6999 pattern[1] * i,
7000 pattern[0] * i);
7001 }
7002
7003 // Encodable cases.
7004 __ Str(p8, SVEMemOperand(x0));
7005 __ Str(p9, SVEMemOperand(x0, 2, SVE_MUL_VL));
7006 __ Str(p10, SVEMemOperand(x0, -3, SVE_MUL_VL));
7007 __ Str(p11, SVEMemOperand(x0, 255, SVE_MUL_VL));
7008
Jacob Bramley6ebbba62019-10-09 15:02:10 +01007009 // Cases that fall back on `CalculateSVEAddress`.
Jacob Bramley199339d2019-08-05 18:49:13 +01007010 __ Str(p12, SVEMemOperand(x0, 6 * pl));
7011 __ Str(p13, SVEMemOperand(x0, -7 * pl));
7012 __ Str(p14, SVEMemOperand(x0, 314, SVE_MUL_VL));
7013 __ Str(p15, SVEMemOperand(x0, -314, SVE_MUL_VL));
7014
7015 // Corresponding loads.
7016 __ Ldr(p0, SVEMemOperand(x0));
7017 __ Ldr(p1, SVEMemOperand(x0, 2, SVE_MUL_VL));
7018 __ Ldr(p2, SVEMemOperand(x0, -3, SVE_MUL_VL));
7019 __ Ldr(p3, SVEMemOperand(x0, 255, SVE_MUL_VL));
7020
7021 __ Ldr(p4, SVEMemOperand(x0, 6 * pl));
7022 __ Ldr(p5, SVEMemOperand(x0, -7 * pl));
7023 __ Ldr(p6, SVEMemOperand(x0, 314, SVE_MUL_VL));
7024 __ Ldr(p7, SVEMemOperand(x0, -314, SVE_MUL_VL));
7025
7026 END();
7027
7028 if (CAN_RUN()) {
7029 RUN();
7030
7031 uint8_t* expected = new uint8_t[data_size];
7032 memset(expected, 0, data_size);
7033 uint8_t* middle = &expected[data_size / 2];
7034
7035 for (int i = 0; i < pl; i++) {
7036 int bit_index = (i % sizeof(pattern[0])) * kBitsPerByte;
7037 size_t index = i / sizeof(pattern[0]);
7038 VIXL_ASSERT(index < ArrayLength(pattern));
7039 uint64_t byte = (pattern[index] >> bit_index) & 0xff;
7040 // Each byte of `pattern` can be multiplied by 15 without carry.
7041 VIXL_ASSERT((byte * 15) <= 0xff);
7042
7043 middle[i] = byte * 8; // p8
7044 middle[(2 * pl) + i] = byte * 9; // p9
7045 middle[(-3 * pl) + i] = byte * 10; // p10
7046 middle[(255 * pl) + i] = byte * 11; // p11
7047 middle[(6 * pl) + i] = byte * 12; // p12
7048 middle[(-7 * pl) + i] = byte * 13; // p13
7049 middle[(314 * pl) + i] = byte * 14; // p14
7050 middle[(-314 * pl) + i] = byte * 15; // p15
7051 }
7052
Jacob Bramley33c99f92019-10-08 15:24:12 +01007053 ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
Jacob Bramley199339d2019-08-05 18:49:13 +01007054
7055 ASSERT_EQUAL_SVE(p0, p8);
7056 ASSERT_EQUAL_SVE(p1, p9);
7057 ASSERT_EQUAL_SVE(p2, p10);
7058 ASSERT_EQUAL_SVE(p3, p11);
7059 ASSERT_EQUAL_SVE(p4, p12);
7060 ASSERT_EQUAL_SVE(p5, p13);
7061 ASSERT_EQUAL_SVE(p6, p14);
7062 ASSERT_EQUAL_SVE(p7, p15);
7063
7064 delete[] expected;
7065 }
7066 delete[] data;
7067}
7068
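// Write `data` to `base + offset + (index * sizeof(data))`. This mirrors one
// lane of a contiguous store and is used to build the expected memory images
// in the tests below.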
Jacob Bramleye668b202019-08-14 17:57:34 +01007069template <typename T>
7070static void MemoryWrite(uint8_t* base, int64_t offset, int64_t index, T data) {
7071 memcpy(base + offset + (index * sizeof(data)), &data, sizeof(data));
7072}
7073
7074TEST_SVE(sve_ld1_st1_contiguous) {
7075 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
7076 START();
7077
7078 int vl = config->sve_vl_in_bytes();
7079
7080 // The immediate can address [-8, 7] times the VL, so allocate enough space to
7081 // exceed that in both directions.
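// Centring the base in the buffer leaves 64 * VL in each direction, enough for
// the encodable range as well as the unencodable offsets used below.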
7082 int data_size = vl * 128;
7083
7084 uint8_t* data = new uint8_t[data_size];
7085 memset(data, 0, data_size);
7086
Martyn Capewell452ad8b2020-03-19 15:49:57 +00007087 // Set the base half-way through the buffer so we can use negative indices.
Jacob Bramleye668b202019-08-14 17:57:34 +01007088 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
7089
Jacob Bramleye668b202019-08-14 17:57:34 +01007090 // Encodable scalar-plus-immediate cases.
7091 __ Index(z1.VnB(), 1, -3);
7092 __ Ptrue(p1.VnB());
7093 __ St1b(z1.VnB(), p1, SVEMemOperand(x0));
7094
7095 __ Index(z2.VnH(), -2, 5);
7096 __ Ptrue(p2.VnH(), SVE_MUL3);
7097 __ St1b(z2.VnH(), p2, SVEMemOperand(x0, 7, SVE_MUL_VL));
7098
7099 __ Index(z3.VnS(), 3, -7);
7100 __ Ptrue(p3.VnS(), SVE_POW2);
7101 __ St1h(z3.VnS(), p3, SVEMemOperand(x0, -8, SVE_MUL_VL));
7102
7103 // Encodable scalar-plus-scalar cases.
7104 __ Index(z4.VnD(), -4, 11);
7105 __ Ptrue(p4.VnD(), SVE_VL3);
7106 __ Addvl(x1, x0, 8); // Try not to overlap with VL-dependent cases.
7107 __ Mov(x2, 17);
7108 __ St1b(z4.VnD(), p4, SVEMemOperand(x1, x2));
7109
7110 __ Index(z5.VnD(), 6, -2);
7111 __ Ptrue(p5.VnD(), SVE_VL16);
TatWai Chong6205eb42019-09-24 10:07:20 +01007112 __ Addvl(x3, x0, 10); // Try not to overlap with VL-dependent cases.
7113 __ Mov(x4, 6);
7114 __ St1d(z5.VnD(), p5, SVEMemOperand(x3, x4, LSL, 3));
Jacob Bramleye668b202019-08-14 17:57:34 +01007115
Jacob Bramley6ebbba62019-10-09 15:02:10 +01007116 // Unencodable cases fall back on `CalculateSVEAddress`.
Jacob Bramleye668b202019-08-14 17:57:34 +01007117 __ Index(z6.VnS(), -7, 3);
7118 // Setting SVE_ALL on B lanes checks that the Simulator ignores irrelevant
7119 // predicate bits when handling larger lanes.
7120 __ Ptrue(p6.VnB(), SVE_ALL);
7121 __ St1w(z6.VnS(), p6, SVEMemOperand(x0, 42, SVE_MUL_VL));
7122
TatWai Chong6205eb42019-09-24 10:07:20 +01007123 __ Index(z7.VnD(), 32, -11);
7124 __ Ptrue(p7.VnD(), SVE_MUL4);
7125 __ St1w(z7.VnD(), p7, SVEMemOperand(x0, 22, SVE_MUL_VL));
Jacob Bramleye668b202019-08-14 17:57:34 +01007126
TatWai Chong6205eb42019-09-24 10:07:20 +01007127 // Corresponding loads.
7128 __ Ld1b(z8.VnB(), p1.Zeroing(), SVEMemOperand(x0));
7129 __ Ld1b(z9.VnH(), p2.Zeroing(), SVEMemOperand(x0, 7, SVE_MUL_VL));
7130 __ Ld1h(z10.VnS(), p3.Zeroing(), SVEMemOperand(x0, -8, SVE_MUL_VL));
7131 __ Ld1b(z11.VnD(), p4.Zeroing(), SVEMemOperand(x1, x2));
7132 __ Ld1d(z12.VnD(), p5.Zeroing(), SVEMemOperand(x3, x4, LSL, 3));
7133 __ Ld1w(z13.VnS(), p6.Zeroing(), SVEMemOperand(x0, 42, SVE_MUL_VL));
7134
7135 __ Ld1sb(z14.VnH(), p2.Zeroing(), SVEMemOperand(x0, 7, SVE_MUL_VL));
7136 __ Ld1sh(z15.VnS(), p3.Zeroing(), SVEMemOperand(x0, -8, SVE_MUL_VL));
7137 __ Ld1sb(z16.VnD(), p4.Zeroing(), SVEMemOperand(x1, x2));
7138 __ Ld1sw(z17.VnD(), p7.Zeroing(), SVEMemOperand(x0, 22, SVE_MUL_VL));
7139
7140 // We can test ld1 by comparing the value loaded with the value stored. In
7141 // most cases, there are two complications:
7142 // - Loads have zeroing predication, so we have to clear the inactive
7143 // elements on our reference.
7144 // - We have to replicate any sign- or zero-extension.
7145
7146 // Ld1b(z8.VnB(), ...)
7147 __ Dup(z18.VnB(), 0);
7148 __ Mov(z18.VnB(), p1.Merging(), z1.VnB());
7149
7150 // Ld1b(z9.VnH(), ...)
7151 __ Dup(z19.VnH(), 0);
7152 __ Uxtb(z19.VnH(), p2.Merging(), z2.VnH());
7153
7154 // Ld1h(z10.VnS(), ...)
7155 __ Dup(z20.VnS(), 0);
7156 __ Uxth(z20.VnS(), p3.Merging(), z3.VnS());
7157
7158 // Ld1b(z11.VnD(), ...)
7159 __ Dup(z21.VnD(), 0);
7160 __ Uxtb(z21.VnD(), p4.Merging(), z4.VnD());
7161
7162 // Ld1d(z12.VnD(), ...)
7163 __ Dup(z22.VnD(), 0);
7164 __ Mov(z22.VnD(), p5.Merging(), z5.VnD());
7165
7166 // Ld1w(z13.VnS(), ...)
7167 __ Dup(z23.VnS(), 0);
7168 __ Mov(z23.VnS(), p6.Merging(), z6.VnS());
7169
7170 // Ld1sb(z14.VnH(), ...)
7171 __ Dup(z24.VnH(), 0);
7172 __ Sxtb(z24.VnH(), p2.Merging(), z2.VnH());
7173
7174 // Ld1sh(z15.VnS(), ...)
7175 __ Dup(z25.VnS(), 0);
7176 __ Sxth(z25.VnS(), p3.Merging(), z3.VnS());
7177
7178 // Ld1sb(z16.VnD(), ...)
7179 __ Dup(z26.VnD(), 0);
7180 __ Sxtb(z26.VnD(), p4.Merging(), z4.VnD());
7181
7182 // Ld1sw(z17.VnD(), ...)
7183 __ Dup(z27.VnD(), 0);
7184 __ Sxtw(z27.VnD(), p7.Merging(), z7.VnD());
Jacob Bramleye668b202019-08-14 17:57:34 +01007185
7186 END();
7187
7188 if (CAN_RUN()) {
7189 RUN();
7190
7191 uint8_t* expected = new uint8_t[data_size];
7192 memset(expected, 0, data_size);
7193 uint8_t* middle = &expected[data_size / 2];
7194
7195 int vl_b = vl / kBRegSizeInBytes;
7196 int vl_h = vl / kHRegSizeInBytes;
7197 int vl_s = vl / kSRegSizeInBytes;
7198 int vl_d = vl / kDRegSizeInBytes;
7199
7200 // Encodable cases.
7201
7202 // st1b { z1.b }, SVE_ALL
7203 for (int i = 0; i < vl_b; i++) {
7204 MemoryWrite(middle, 0, i, static_cast<uint8_t>(1 - (3 * i)));
7205 }
7206
7207 // st1b { z2.h }, SVE_MUL3
7208 int vl_h_mul3 = vl_h - (vl_h % 3);
7209 for (int i = 0; i < vl_h_mul3; i++) {
Jacob Bramley6ebbba62019-10-09 15:02:10 +01007210 int64_t offset = 7 * static_cast<int>(vl / (kHRegSize / kBRegSize));
7211 MemoryWrite(middle, offset, i, static_cast<uint8_t>(-2 + (5 * i)));
Jacob Bramleye668b202019-08-14 17:57:34 +01007212 }
7213
7214 // st1h { z3.s }, SVE_POW2
7215 int vl_s_pow2 = 1 << HighestSetBitPosition(vl_s);
7216 for (int i = 0; i < vl_s_pow2; i++) {
Jacob Bramley6ebbba62019-10-09 15:02:10 +01007217 int64_t offset = -8 * static_cast<int>(vl / (kSRegSize / kHRegSize));
7218 MemoryWrite(middle, offset, i, static_cast<uint16_t>(3 - (7 * i)));
Jacob Bramleye668b202019-08-14 17:57:34 +01007219 }
7220
7221 // st1b { z4.d }, SVE_VL3
7222 if (vl_d >= 3) {
7223 for (int i = 0; i < 3; i++) {
7224 MemoryWrite(middle,
7225 (8 * vl) + 17,
7226 i,
7227 static_cast<uint8_t>(-4 + (11 * i)));
7228 }
7229 }
7230
7231 // st1d { z5.d }, SVE_VL16
7232 if (vl_d >= 16) {
7233 for (int i = 0; i < 16; i++) {
7234 MemoryWrite(middle,
7235 (10 * vl) + (6 * kDRegSizeInBytes),
7236 i,
7237 static_cast<uint64_t>(6 - (2 * i)));
7238 }
7239 }
7240
7241 // Unencodable cases.
7242
7243 // st1w { z6.s }, SVE_ALL
7244 for (int i = 0; i < vl_s; i++) {
7245 MemoryWrite(middle, 42 * vl, i, static_cast<uint32_t>(-7 + (3 * i)));
7246 }
7247
TatWai Chong6205eb42019-09-24 10:07:20 +01007248 // st1w { z7.d }, SVE_MUL4
7249 int vl_d_mul4 = vl_d - (vl_d % 4);
7250 for (int i = 0; i < vl_d_mul4; i++) {
Jacob Bramley6ebbba62019-10-09 15:02:10 +01007251 int64_t offset = 22 * static_cast<int>(vl / (kDRegSize / kWRegSize));
7252 MemoryWrite(middle, offset, i, static_cast<uint32_t>(32 + (-11 * i)));
TatWai Chong6205eb42019-09-24 10:07:20 +01007253 }
7254
Jacob Bramley33c99f92019-10-08 15:24:12 +01007255 ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
Jacob Bramleye668b202019-08-14 17:57:34 +01007256
TatWai Chong6205eb42019-09-24 10:07:20 +01007257 // Check that we loaded back the expected values.
7258
7259 ASSERT_EQUAL_SVE(z18, z8);
7260 ASSERT_EQUAL_SVE(z19, z9);
7261 ASSERT_EQUAL_SVE(z20, z10);
7262 ASSERT_EQUAL_SVE(z21, z11);
7263 ASSERT_EQUAL_SVE(z22, z12);
7264 ASSERT_EQUAL_SVE(z23, z13);
7265 ASSERT_EQUAL_SVE(z24, z14);
7266 ASSERT_EQUAL_SVE(z25, z15);
7267 ASSERT_EQUAL_SVE(z26, z16);
7268 ASSERT_EQUAL_SVE(z27, z17);
7269
Jacob Bramleye668b202019-08-14 17:57:34 +01007270 delete[] expected;
7271 }
7272 delete[] data;
7273}
7274
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007275TEST_SVE(sve_ld2_st2_scalar_plus_imm) {
7276 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
7277 START();
7278
7279 int vl = config->sve_vl_in_bytes();
7280
7281 // The immediate can address [-16, 14] times the VL, so allocate enough space
7282 // to exceed that in both directions.
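// As before, centring the base leaves 64 * VL in each direction, well beyond
// the encodable range.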
7283 int data_size = vl * 128;
7284
7285 uint8_t* data = new uint8_t[data_size];
7286 memset(data, 0, data_size);
7287
7288  // Set the base half-way through the buffer so we can use negative indices.
7289 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
7290
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007291 __ Index(z14.VnB(), 1, -3);
7292 __ Index(z15.VnB(), 2, -3);
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007293 __ Ptrue(p0.VnB());
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007294 __ St2b(z14.VnB(), z15.VnB(), p0, SVEMemOperand(x0));
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007295
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007296 __ Index(z16.VnH(), -2, 5);
7297 __ Index(z17.VnH(), -3, 5);
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007298 __ Ptrue(p1.VnH(), SVE_MUL3);
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007299 __ St2h(z16.VnH(), z17.VnH(), p1, SVEMemOperand(x0, 8, SVE_MUL_VL));
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007300
7301 // Wrap around from z31 to z0.
7302 __ Index(z31.VnS(), 3, -7);
7303 __ Index(z0.VnS(), 4, -7);
7304 __ Ptrue(p2.VnS(), SVE_POW2);
7305 __ St2w(z31.VnS(), z0.VnS(), p2, SVEMemOperand(x0, -12, SVE_MUL_VL));
7306
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007307 __ Index(z18.VnD(), -7, 3);
7308 __ Index(z19.VnD(), -8, 3);
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007309 // Sparse predication, including some irrelevant bits (0xe). To make the
7310 // results easy to check, activate each lane <n> where n is a multiple of 5.
7311 Initialise(&masm,
7312 p3,
7313 0xeee10000000001ee,
7314 0xeeeeeee100000000,
7315 0x01eeeeeeeee10000,
7316 0x000001eeeeeeeee1);
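// A note on this encoding (assuming the last Initialise argument supplies the
// lowest 64 predicate bits, as the checks below imply): each .D lane owns
// eight predicate bits, of which only the lowest is significant. Reading the
// low chunk byte by byte, 0xe1 activates lane 0, the 0xee bytes leave lanes
// 1-4 inactive, 0x01 activates lane 5, and so on, so the active lanes are
// exactly those where (n % 5) == 0.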
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007317 __ St2d(z18.VnD(), z19.VnD(), p3, SVEMemOperand(x0, 14, SVE_MUL_VL));
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007318
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007319 // We can test ld2 by comparing the values loaded with the values stored.
7320 // There are two complications:
7321 // - Loads have zeroing predication, so we have to clear the inactive
7322 // elements on our reference.
7323 // - We want to test both loads and stores that span { z31, z0 }, so we have
7324 // to move some values around.
7325 //
7326 // Registers z4-z11 will hold as-stored values (with inactive elements
7327 // cleared). Registers z20-z27 will hold the values that were loaded.
7328
7329 // Ld2b(z14.VnB(), z15.VnB(), ...)
7330 __ Dup(z4.VnB(), 0);
7331 __ Dup(z5.VnB(), 0);
7332 __ Mov(z4.VnB(), p0.Merging(), z14.VnB());
7333 __ Mov(z5.VnB(), p0.Merging(), z15.VnB());
7334
7335 // Ld2h(z16.VnH(), z17.VnH(), ...)
7336 __ Dup(z6.VnH(), 0);
7337 __ Dup(z7.VnH(), 0);
7338 __ Mov(z6.VnH(), p1.Merging(), z16.VnH());
7339 __ Mov(z7.VnH(), p1.Merging(), z17.VnH());
7340
7341 // Ld2w(z31.VnS(), z0.VnS(), ...)
7342 __ Dup(z8.VnS(), 0);
7343 __ Dup(z9.VnS(), 0);
7344 __ Mov(z8.VnS(), p2.Merging(), z31.VnS());
7345 __ Mov(z9.VnS(), p2.Merging(), z0.VnS());
7346
7347 // Ld2d(z18.VnD(), z19.VnD(), ...)
7348 __ Dup(z10.VnD(), 0);
7349 __ Dup(z11.VnD(), 0);
7350 __ Mov(z10.VnD(), p3.Merging(), z18.VnD());
7351 __ Mov(z11.VnD(), p3.Merging(), z19.VnD());
7352
7353 // Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
7354 __ Ld2b(z31.VnB(), z0.VnB(), p0.Zeroing(), SVEMemOperand(x0));
7355 __ Mov(z20, z31);
7356 __ Mov(z21, z0);
7357
7358 __ Ld2h(z22.VnH(), z23.VnH(), p1.Zeroing(), SVEMemOperand(x0, 8, SVE_MUL_VL));
7359 __ Ld2w(z24.VnS(),
7360 z25.VnS(),
7361 p2.Zeroing(),
7362 SVEMemOperand(x0, -12, SVE_MUL_VL));
7363 __ Ld2d(z26.VnD(),
7364 z27.VnD(),
7365 p3.Zeroing(),
7366 SVEMemOperand(x0, 14, SVE_MUL_VL));
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007367
7368 END();
7369
7370 if (CAN_RUN()) {
7371 RUN();
7372
7373 uint8_t* expected = new uint8_t[data_size];
7374 memset(expected, 0, data_size);
7375 uint8_t* middle = &expected[data_size / 2];
7376
7377 int vl_b = vl / kBRegSizeInBytes;
7378 int vl_h = vl / kHRegSizeInBytes;
7379 int vl_s = vl / kSRegSizeInBytes;
7380 int vl_d = vl / kDRegSizeInBytes;
7381
7382 int reg_count = 2;
7383
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007384 // st2b { z14.b, z15.b }, SVE_ALL
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007385 for (int i = 0; i < vl_b; i++) {
7386 uint8_t lane0 = 1 - (3 * i);
7387 uint8_t lane1 = 2 - (3 * i);
7388 MemoryWrite(middle, 0, (i * reg_count) + 0, lane0);
7389 MemoryWrite(middle, 0, (i * reg_count) + 1, lane1);
7390 }
7391
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007392 // st2h { z16.h, z17.h }, SVE_MUL3
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007393 int vl_h_mul3 = vl_h - (vl_h % 3);
7394 for (int i = 0; i < vl_h_mul3; i++) {
7395 int64_t offset = 8 * vl;
7396 uint16_t lane0 = -2 + (5 * i);
7397 uint16_t lane1 = -3 + (5 * i);
7398 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7399 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7400 }
7401
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007402 // st2w { z31.s, z0.s }, SVE_POW2
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007403 int vl_s_pow2 = 1 << HighestSetBitPosition(vl_s);
7404 for (int i = 0; i < vl_s_pow2; i++) {
7405 int64_t offset = -12 * vl;
7406 uint32_t lane0 = 3 - (7 * i);
7407 uint32_t lane1 = 4 - (7 * i);
7408 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7409 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7410 }
7411
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007412 // st2d { z18.d, z19.d }, ((i % 5) == 0)
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007413 for (int i = 0; i < vl_d; i++) {
7414 if ((i % 5) == 0) {
7415 int64_t offset = 14 * vl;
7416 uint64_t lane0 = -7 + (3 * i);
7417 uint64_t lane1 = -8 + (3 * i);
7418 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7419 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7420 }
7421 }
7422
7423 ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
7424
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007425 // Check that we loaded back the expected values.
7426
7427 // st2b/ld2b
7428 ASSERT_EQUAL_SVE(z4, z20);
7429 ASSERT_EQUAL_SVE(z5, z21);
7430
7431 // st2h/ld2h
7432 ASSERT_EQUAL_SVE(z6, z22);
7433 ASSERT_EQUAL_SVE(z7, z23);
7434
7435 // st2w/ld2w
7436 ASSERT_EQUAL_SVE(z8, z24);
7437 ASSERT_EQUAL_SVE(z9, z25);
7438
7439 // st2d/ld2d
7440 ASSERT_EQUAL_SVE(z10, z26);
7441 ASSERT_EQUAL_SVE(z11, z27);
7442
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007443 delete[] expected;
7444 }
7445 delete[] data;
7446}
7447
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007448TEST_SVE(sve_ld2_st2_scalar_plus_scalar) {
7449 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
7450 START();
7451
7452 int vl = config->sve_vl_in_bytes();
7453
7454 // Allocate plenty of space to enable indexing in both directions.
7455 int data_size = vl * 128;
7456
7457 uint8_t* data = new uint8_t[data_size];
7458 memset(data, 0, data_size);
7459
7460   // Set the base half-way through the buffer so we can use negative indices.
7461 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
7462
Jacob Bramleye483ce52019-11-05 16:52:29 +00007463 __ Index(z10.VnB(), -4, 11);
7464 __ Index(z11.VnB(), -5, 11);
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007465 __ Ptrue(p7.VnB(), SVE_MUL4);
7466 __ Mov(x1, 0);
Jacob Bramleye483ce52019-11-05 16:52:29 +00007467 __ St2b(z10.VnB(), z11.VnB(), p7, SVEMemOperand(x0, x1));
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007468
Jacob Bramleye483ce52019-11-05 16:52:29 +00007469 __ Index(z12.VnH(), 6, -2);
7470 __ Index(z13.VnH(), 7, -2);
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007471 __ Ptrue(p6.VnH(), SVE_VL16);
7472 __ Rdvl(x2, 3); // Make offsets VL-dependent so we can avoid overlap.
Jacob Bramleye483ce52019-11-05 16:52:29 +00007473 __ St2h(z12.VnH(), z13.VnH(), p6, SVEMemOperand(x0, x2, LSL, 1));
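// With x2 = 3 * VL and the LSL #1 scaling, the effective address is
// x0 + (6 * VL) bytes; the expected-value loop below recomputes the same
// offset as (3 << kHRegSizeInBytesLog2) * vl.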
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007474
Jacob Bramleye483ce52019-11-05 16:52:29 +00007475 __ Index(z14.VnS(), -7, 3);
7476 __ Index(z15.VnS(), -8, 3);
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007477 // Sparse predication, including some irrelevant bits (0xe). To make the
7478 // results easy to check, activate each lane <n> where n is a multiple of 5.
7479 Initialise(&masm,
7480 p5,
7481 0xeee1000010000100,
7482 0x001eeee100001000,
7483 0x0100001eeee10000,
7484 0x10000100001eeee1);
Jacob Bramleye483ce52019-11-05 16:52:29 +00007485 __ Rdvl(x3, -3);
7486 __ St2w(z14.VnS(), z15.VnS(), p5, SVEMemOperand(x0, x3, LSL, 2));
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007487
7488 // Wrap around from z31 to z0.
7489 __ Index(z31.VnD(), 32, -11);
7490 __ Index(z0.VnD(), 33, -11);
7491 __ Ptrue(p4.VnD(), SVE_MUL3);
Jacob Bramleye483ce52019-11-05 16:52:29 +00007492 __ Rdvl(x4, 1);
7493 __ St2d(z31.VnD(), z0.VnD(), p4, SVEMemOperand(x0, x4, LSL, 3));
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007494
Jacob Bramleye483ce52019-11-05 16:52:29 +00007495 // We can test ld2 by comparing the values loaded with the values stored.
7496 // There are two complications:
7497 // - Loads have zeroing predication, so we have to clear the inactive
7498 // elements on our reference.
7499 // - We want to test both loads and stores that span { z31, z0 }, so we have
7500 // to move some values around.
7501 //
7502 // Registers z4-z11 will hold as-stored values (with inactive elements
7503 // cleared). Registers z20-z27 will hold the values that were loaded.
7504
7505 // Ld2b(z20.VnB(), z21.VnB(), ...)
7506 __ Dup(z4.VnB(), 0);
7507 __ Dup(z5.VnB(), 0);
7508 __ Mov(z4.VnB(), p7.Merging(), z10.VnB());
7509 __ Mov(z5.VnB(), p7.Merging(), z11.VnB());
7510
7511 // Ld2h(z22.VnH(), z23.VnH(), ...)
7512 __ Dup(z6.VnH(), 0);
7513 __ Dup(z7.VnH(), 0);
7514 __ Mov(z6.VnH(), p6.Merging(), z12.VnH());
7515 __ Mov(z7.VnH(), p6.Merging(), z13.VnH());
7516
7517 // Ld2w(z24.VnS(), z25.VnS(), ...)
7518 __ Dup(z8.VnS(), 0);
7519 __ Dup(z9.VnS(), 0);
7520 __ Mov(z8.VnS(), p5.Merging(), z14.VnS());
7521 __ Mov(z9.VnS(), p5.Merging(), z15.VnS());
7522
7523 // Ld2d(z31.VnD(), z0.VnD(), ...)
7524 __ Dup(z10.VnD(), 0);
7525 __ Dup(z11.VnD(), 0);
7526 __ Mov(z10.VnD(), p4.Merging(), z31.VnD());
7527 __ Mov(z11.VnD(), p4.Merging(), z0.VnD());
7528
7529 // Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
7530 __ Ld2b(z31.VnB(), z0.VnB(), p7.Zeroing(), SVEMemOperand(x0, x1));
7531 __ Mov(z20, z31);
7532 __ Mov(z21, z0);
7533
7534 __ Ld2h(z22.VnH(), z23.VnH(), p6.Zeroing(), SVEMemOperand(x0, x2, LSL, 1));
7535 __ Ld2w(z24.VnS(), z25.VnS(), p5.Zeroing(), SVEMemOperand(x0, x3, LSL, 2));
7536 __ Ld2d(z26.VnD(), z27.VnD(), p4.Zeroing(), SVEMemOperand(x0, x4, LSL, 3));
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007537
7538 END();
7539
7540 if (CAN_RUN()) {
7541 RUN();
7542
7543 uint8_t* expected = new uint8_t[data_size];
7544 memset(expected, 0, data_size);
7545 uint8_t* middle = &expected[data_size / 2];
7546
7547 int vl_b = vl / kBRegSizeInBytes;
7548 int vl_h = vl / kHRegSizeInBytes;
7549 int vl_s = vl / kSRegSizeInBytes;
7550 int vl_d = vl / kDRegSizeInBytes;
7551
7552 int reg_count = 2;
7553
Jacob Bramleye483ce52019-11-05 16:52:29 +00007554 // st2b { z10.b, z11.b }, SVE_MUL4
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007555 int vl_b_mul4 = vl_b - (vl_b % 4);
7556 for (int i = 0; i < vl_b_mul4; i++) {
7557 uint8_t lane0 = -4 + (11 * i);
7558 uint8_t lane1 = -5 + (11 * i);
7559 MemoryWrite(middle, 0, (i * reg_count) + 0, lane0);
7560 MemoryWrite(middle, 0, (i * reg_count) + 1, lane1);
7561 }
7562
Jacob Bramleye483ce52019-11-05 16:52:29 +00007563 // st2h { z12.h, z13.h }, SVE_VL16
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007564 if (vl_h >= 16) {
7565 for (int i = 0; i < 16; i++) {
7566 int64_t offset = (3 << kHRegSizeInBytesLog2) * vl;
7567 uint16_t lane0 = 6 - (2 * i);
7568 uint16_t lane1 = 7 - (2 * i);
7569 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7570 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7571 }
7572 }
7573
Jacob Bramleye483ce52019-11-05 16:52:29 +00007574 // st2w { z14.s, z15.s }, ((i % 5) == 0)
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007575 for (int i = 0; i < vl_s; i++) {
7576 if ((i % 5) == 0) {
7577 int64_t offset = -(3 << kSRegSizeInBytesLog2) * vl;
7578 uint32_t lane0 = -7 + (3 * i);
7579 uint32_t lane1 = -8 + (3 * i);
7580 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7581 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7582 }
7583 }
7584
7585     // st2d { z31.d, z0.d }, SVE_MUL3
7586 int vl_d_mul3 = vl_d - (vl_d % 3);
7587 for (int i = 0; i < vl_d_mul3; i++) {
7588 int64_t offset = (1 << kDRegSizeInBytesLog2) * vl;
7589 uint64_t lane0 = 32 - (11 * i);
7590 uint64_t lane1 = 33 - (11 * i);
7591 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7592 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7593 }
7594
7595 ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
7596
Jacob Bramleye483ce52019-11-05 16:52:29 +00007597 // Check that we loaded back the expected values.
7598
7599 // st2b/ld2b
7600 ASSERT_EQUAL_SVE(z4, z20);
7601 ASSERT_EQUAL_SVE(z5, z21);
7602
7603 // st2h/ld2h
7604 ASSERT_EQUAL_SVE(z6, z22);
7605 ASSERT_EQUAL_SVE(z7, z23);
7606
7607 // st2w/ld2w
7608 ASSERT_EQUAL_SVE(z8, z24);
7609 ASSERT_EQUAL_SVE(z9, z25);
7610
7611 // st2d/ld2d
7612 ASSERT_EQUAL_SVE(z10, z26);
7613 ASSERT_EQUAL_SVE(z11, z27);
7614
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007615 delete[] expected;
7616 }
7617 delete[] data;
7618}
7619
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007620TEST_SVE(sve_ld3_st3_scalar_plus_imm) {
7621 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
7622 START();
7623
7624 int vl = config->sve_vl_in_bytes();
7625
7626 // The immediate can address [-24, 21] times the VL, so allocate enough space
7627 // to exceed that in both directions.
7628 int data_size = vl * 128;
7629
7630 uint8_t* data = new uint8_t[data_size];
7631 memset(data, 0, data_size);
7632
7633   // Set the base half-way through the buffer so we can use negative indices.
7634 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
7635
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007636 // We can test ld3 by comparing the values loaded with the values stored.
7637 // There are two complications:
7638 // - Loads have zeroing predication, so we have to clear the inactive
7639 // elements on our reference.
7640 // - We want to test both loads and stores that span { z31, z0 }, so we have
7641 // to move some values around.
7642 //
7643 // Registers z4-z15 will hold as-stored values (with inactive elements
7644 // cleared). Registers z16-z27 will hold the values that were loaded.
7645
7646 __ Index(z10.VnB(), 1, -3);
7647 __ Index(z11.VnB(), 2, -3);
7648 __ Index(z12.VnB(), 3, -3);
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007649 __ Ptrue(p0.VnB());
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007650 __ St3b(z10.VnB(), z11.VnB(), z12.VnB(), p0, SVEMemOperand(x0));
7651 // Save the stored values for ld3 tests.
7652 __ Dup(z4.VnB(), 0);
7653 __ Dup(z5.VnB(), 0);
7654 __ Dup(z6.VnB(), 0);
7655 __ Mov(z4.VnB(), p0.Merging(), z10.VnB());
7656 __ Mov(z5.VnB(), p0.Merging(), z11.VnB());
7657 __ Mov(z6.VnB(), p0.Merging(), z12.VnB());
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007658
7659 // Wrap around from z31 to z0.
7660 __ Index(z31.VnH(), -2, 5);
7661 __ Index(z0.VnH(), -3, 5);
7662 __ Index(z1.VnH(), -4, 5);
7663 __ Ptrue(p1.VnH(), SVE_MUL3);
7664 __ St3h(z31.VnH(), z0.VnH(), z1.VnH(), p1, SVEMemOperand(x0, 9, SVE_MUL_VL));
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007665 // Save the stored values for ld3 tests.
7666 __ Dup(z7.VnH(), 0);
7667 __ Dup(z8.VnH(), 0);
7668 __ Dup(z9.VnH(), 0);
7669 __ Mov(z7.VnH(), p1.Merging(), z31.VnH());
7670 __ Mov(z8.VnH(), p1.Merging(), z0.VnH());
7671 __ Mov(z9.VnH(), p1.Merging(), z1.VnH());
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007672
7673 __ Index(z30.VnS(), 3, -7);
7674 __ Index(z31.VnS(), 4, -7);
7675 __ Index(z0.VnS(), 5, -7);
7676 __ Ptrue(p2.VnS(), SVE_POW2);
7677 __ St3w(z30.VnS(),
7678 z31.VnS(),
7679 z0.VnS(),
7680 p2,
7681 SVEMemOperand(x0, -12, SVE_MUL_VL));
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007682 // Save the stored values for ld3 tests.
7683 __ Dup(z10.VnS(), 0);
7684 __ Dup(z11.VnS(), 0);
7685 __ Dup(z12.VnS(), 0);
7686 __ Mov(z10.VnS(), p2.Merging(), z30.VnS());
7687 __ Mov(z11.VnS(), p2.Merging(), z31.VnS());
7688 __ Mov(z12.VnS(), p2.Merging(), z0.VnS());
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007689
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007690 __ Index(z0.VnD(), -7, 3);
7691 __ Index(z1.VnD(), -8, 3);
7692 __ Index(z2.VnD(), -9, 3);
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007693 // Sparse predication, including some irrelevant bits (0xee). To make the
7694 // results easy to check, activate each lane <n> where n is a multiple of 5.
7695 Initialise(&masm,
7696 p3,
7697 0xeee10000000001ee,
7698 0xeeeeeee100000000,
7699 0x01eeeeeeeee10000,
7700 0x000001eeeeeeeee1);
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007701 __ St3d(z0.VnD(), z1.VnD(), z2.VnD(), p3, SVEMemOperand(x0, 15, SVE_MUL_VL));
7702 // Save the stored values for ld3 tests.
7703 __ Dup(z13.VnD(), 0);
7704 __ Dup(z14.VnD(), 0);
7705 __ Dup(z15.VnD(), 0);
7706 __ Mov(z13.VnD(), p3.Merging(), z0.VnD());
7707 __ Mov(z14.VnD(), p3.Merging(), z1.VnD());
7708 __ Mov(z15.VnD(), p3.Merging(), z2.VnD());
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007709
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007710 // Corresponding loads.
7711 // Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
7712 __ Ld3b(z31.VnB(), z0.VnB(), z1.VnB(), p0.Zeroing(), SVEMemOperand(x0));
7713 __ Mov(z16, z31);
7714 __ Mov(z17, z0);
7715 __ Mov(z18, z1);
7716 __ Ld3h(z30.VnH(),
7717 z31.VnH(),
7718 z0.VnH(),
7719 p1.Zeroing(),
7720 SVEMemOperand(x0, 9, SVE_MUL_VL));
7721 __ Mov(z19, z30);
7722 __ Mov(z20, z31);
7723 __ Mov(z21, z0);
7724 __ Ld3w(z22.VnS(),
7725 z23.VnS(),
7726 z24.VnS(),
7727 p2.Zeroing(),
7728 SVEMemOperand(x0, -12, SVE_MUL_VL));
7729 __ Ld3d(z25.VnD(),
7730 z26.VnD(),
7731 z27.VnD(),
7732 p3.Zeroing(),
7733 SVEMemOperand(x0, 15, SVE_MUL_VL));
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007734
7735 END();
7736
7737 if (CAN_RUN()) {
7738 RUN();
7739
7740 uint8_t* expected = new uint8_t[data_size];
7741 memset(expected, 0, data_size);
7742 uint8_t* middle = &expected[data_size / 2];
7743
7744 int vl_b = vl / kBRegSizeInBytes;
7745 int vl_h = vl / kHRegSizeInBytes;
7746 int vl_s = vl / kSRegSizeInBytes;
7747 int vl_d = vl / kDRegSizeInBytes;
7748
7749 int reg_count = 3;
7750
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007751 // st3b { z10.b, z11.b, z12.b }, SVE_ALL
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007752 for (int i = 0; i < vl_b; i++) {
7753 uint8_t lane0 = 1 - (3 * i);
7754 uint8_t lane1 = 2 - (3 * i);
7755 uint8_t lane2 = 3 - (3 * i);
7756 MemoryWrite(middle, 0, (i * reg_count) + 0, lane0);
7757 MemoryWrite(middle, 0, (i * reg_count) + 1, lane1);
7758 MemoryWrite(middle, 0, (i * reg_count) + 2, lane2);
7759 }
7760
7761 // st3h { z31.h, z0.h, z1.h }, SVE_MUL3
7762 int vl_h_mul3 = vl_h - (vl_h % 3);
7763 for (int i = 0; i < vl_h_mul3; i++) {
7764 int64_t offset = 9 * vl;
7765 uint16_t lane0 = -2 + (5 * i);
7766 uint16_t lane1 = -3 + (5 * i);
7767 uint16_t lane2 = -4 + (5 * i);
7768 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7769 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7770 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
7771 }
7772
7773 // st3w { z30.s, z31.s, z0.s }, SVE_POW2
7774 int vl_s_pow2 = 1 << HighestSetBitPosition(vl_s);
7775 for (int i = 0; i < vl_s_pow2; i++) {
7776 int64_t offset = -12 * vl;
7777 uint32_t lane0 = 3 - (7 * i);
7778 uint32_t lane1 = 4 - (7 * i);
7779 uint32_t lane2 = 5 - (7 * i);
7780 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7781 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7782 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
7783 }
7784
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007785 // st3d { z0.d, z1.d, z2.d }, ((i % 5) == 0)
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007786 for (int i = 0; i < vl_d; i++) {
7787 if ((i % 5) == 0) {
7788 int64_t offset = 15 * vl;
7789 uint64_t lane0 = -7 + (3 * i);
7790 uint64_t lane1 = -8 + (3 * i);
7791 uint64_t lane2 = -9 + (3 * i);
7792 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7793 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7794 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
7795 }
7796 }
7797
7798 ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
7799
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007800 // Check that we loaded back the expected values.
7801
7802 // st3b/ld3b
7803 ASSERT_EQUAL_SVE(z4, z16);
7804 ASSERT_EQUAL_SVE(z5, z17);
7805 ASSERT_EQUAL_SVE(z6, z18);
7806
7807 // st3h/ld3h
7808 ASSERT_EQUAL_SVE(z7, z19);
7809 ASSERT_EQUAL_SVE(z8, z20);
7810 ASSERT_EQUAL_SVE(z9, z21);
7811
7812 // st3w/ld3w
7813 ASSERT_EQUAL_SVE(z10, z22);
7814 ASSERT_EQUAL_SVE(z11, z23);
7815 ASSERT_EQUAL_SVE(z12, z24);
7816
7817 // st3d/ld3d
7818 ASSERT_EQUAL_SVE(z13, z25);
7819 ASSERT_EQUAL_SVE(z14, z26);
7820 ASSERT_EQUAL_SVE(z15, z27);
7821
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007822 delete[] expected;
7823 }
7824 delete[] data;
7825}
7826
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007827TEST_SVE(sve_ld3_st3_scalar_plus_scalar) {
7828 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
7829 START();
7830
7831 int vl = config->sve_vl_in_bytes();
7832
7833 // Allocate plenty of space to enable indexing in both directions.
7834 int data_size = vl * 128;
7835
7836 uint8_t* data = new uint8_t[data_size];
7837 memset(data, 0, data_size);
7838
7839   // Set the base half-way through the buffer so we can use negative indices.
7840 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
7841
Jacob Bramleye483ce52019-11-05 16:52:29 +00007842 // We can test ld3 by comparing the values loaded with the values stored.
7843 // There are two complications:
7844 // - Loads have zeroing predication, so we have to clear the inactive
7845 // elements on our reference.
7846 // - We want to test both loads and stores that span { z31, z0 }, so we have
7847 // to move some values around.
7848 //
7849 // Registers z4-z15 will hold as-stored values (with inactive elements
7850 // cleared). Registers z16-z27 will hold the values that were loaded.
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007851
Jacob Bramleye483ce52019-11-05 16:52:29 +00007852 __ Index(z10.VnB(), -4, 11);
7853 __ Index(z11.VnB(), -5, 11);
7854 __ Index(z12.VnB(), -6, 11);
7855 __ Ptrue(p7.VnB(), SVE_MUL4);
7856 __ Rdvl(x1, -1); // Make offsets VL-dependent so we can avoid overlap.
7857 __ St3b(z10.VnB(), z11.VnB(), z12.VnB(), p7, SVEMemOperand(x0, x1, LSL, 0));
7858 // Save the stored values for ld3 tests.
7859 __ Dup(z4.VnB(), 0);
7860 __ Dup(z5.VnB(), 0);
7861 __ Dup(z6.VnB(), 0);
7862 __ Mov(z4.VnB(), p7.Merging(), z10.VnB());
7863 __ Mov(z5.VnB(), p7.Merging(), z11.VnB());
7864 __ Mov(z6.VnB(), p7.Merging(), z12.VnB());
7865
7866 __ Index(z13.VnH(), 6, -2);
7867 __ Index(z14.VnH(), 7, -2);
7868 __ Index(z15.VnH(), 8, -2);
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007869 __ Ptrue(p6.VnH(), SVE_VL16);
Jacob Bramleye483ce52019-11-05 16:52:29 +00007870 __ Rdvl(x2, 5); // (5 * vl) << 1 = 10 * vl
7871 __ St3h(z13.VnH(), z14.VnH(), z15.VnH(), p6, SVEMemOperand(x0, x2, LSL, 1));
7872 // Save the stored values for ld3 tests.
7873 __ Dup(z7.VnH(), 0);
7874 __ Dup(z8.VnH(), 0);
7875 __ Dup(z9.VnH(), 0);
7876 __ Mov(z7.VnH(), p6.Merging(), z13.VnH());
7877 __ Mov(z8.VnH(), p6.Merging(), z14.VnH());
7878 __ Mov(z9.VnH(), p6.Merging(), z15.VnH());
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007879
7880 // Wrap around from z31 to z0.
7881 __ Index(z30.VnS(), -7, 3);
7882 __ Index(z31.VnS(), -8, 3);
7883 __ Index(z0.VnS(), -9, 3);
7884 // Sparse predication, including some irrelevant bits (0xe). To make the
7885 // results easy to check, activate each lane <n> where n is a multiple of 5.
7886 Initialise(&masm,
7887 p5,
7888 0xeee1000010000100,
7889 0x001eeee100001000,
7890 0x0100001eeee10000,
7891 0x10000100001eeee1);
Jacob Bramleye483ce52019-11-05 16:52:29 +00007892 __ Rdvl(x3, -5); // -(5 * vl) << 2 = -20 * vl
7893 __ St3w(z30.VnS(), z31.VnS(), z0.VnS(), p5, SVEMemOperand(x0, x3, LSL, 2));
7894 // Save the stored values for ld3 tests.
7895 __ Dup(z10.VnS(), 0);
7896 __ Dup(z11.VnS(), 0);
7897 __ Dup(z12.VnS(), 0);
7898 __ Mov(z10.VnS(), p5.Merging(), z30.VnS());
7899 __ Mov(z11.VnS(), p5.Merging(), z31.VnS());
7900 __ Mov(z12.VnS(), p5.Merging(), z0.VnS());
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007901
7902 __ Index(z31.VnD(), 32, -11);
7903 __ Index(z0.VnD(), 33, -11);
7904 __ Index(z1.VnD(), 34, -11);
7905 __ Ptrue(p4.VnD(), SVE_MUL3);
Jacob Bramleye483ce52019-11-05 16:52:29 +00007906 __ Rdvl(x4, -1); // -(1 * vl) << 3 = -8 * vl
7907 __ St3d(z31.VnD(), z0.VnD(), z1.VnD(), p4, SVEMemOperand(x0, x4, LSL, 3));
7908 // Save the stored values for ld3 tests.
7909 __ Dup(z13.VnD(), 0);
7910 __ Dup(z14.VnD(), 0);
7911 __ Dup(z15.VnD(), 0);
7912 __ Mov(z13.VnD(), p4.Merging(), z31.VnD());
7913 __ Mov(z14.VnD(), p4.Merging(), z0.VnD());
7914 __ Mov(z15.VnD(), p4.Merging(), z1.VnD());
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007915
Jacob Bramleye483ce52019-11-05 16:52:29 +00007916 // Corresponding loads.
7917 // Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
7918 __ Ld3b(z31.VnB(),
7919 z0.VnB(),
7920 z1.VnB(),
7921 p7.Zeroing(),
7922 SVEMemOperand(x0, x1, LSL, 0));
7923 __ Mov(z16, z31);
7924 __ Mov(z17, z0);
7925 __ Mov(z18, z1);
7926 __ Ld3h(z30.VnH(),
7927 z31.VnH(),
7928 z0.VnH(),
7929 p6.Zeroing(),
7930 SVEMemOperand(x0, x2, LSL, 1));
7931 __ Mov(z19, z30);
7932 __ Mov(z20, z31);
7933 __ Mov(z21, z0);
7934 __ Ld3w(z22.VnS(),
7935 z23.VnS(),
7936 z24.VnS(),
7937 p5.Zeroing(),
7938 SVEMemOperand(x0, x3, LSL, 2));
7939 __ Ld3d(z25.VnD(),
7940 z26.VnD(),
7941 z27.VnD(),
7942 p4.Zeroing(),
7943 SVEMemOperand(x0, x4, LSL, 3));
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007944
7945 END();
7946
7947 if (CAN_RUN()) {
7948 RUN();
7949
7950 uint8_t* expected = new uint8_t[data_size];
7951 memset(expected, 0, data_size);
7952 uint8_t* middle = &expected[data_size / 2];
7953
7954 int vl_b = vl / kBRegSizeInBytes;
7955 int vl_h = vl / kHRegSizeInBytes;
7956 int vl_s = vl / kSRegSizeInBytes;
7957 int vl_d = vl / kDRegSizeInBytes;
7958
7959 int reg_count = 3;
7960
Jacob Bramleye483ce52019-11-05 16:52:29 +00007961 // st3b { z10.b, z11.b, z12.b }, SVE_MUL4
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007962 int vl_b_mul4 = vl_b - (vl_b % 4);
7963 for (int i = 0; i < vl_b_mul4; i++) {
Jacob Bramleye483ce52019-11-05 16:52:29 +00007964 int64_t offset = -(1 << kBRegSizeInBytesLog2) * vl;
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007965 uint8_t lane0 = -4 + (11 * i);
7966 uint8_t lane1 = -5 + (11 * i);
7967 uint8_t lane2 = -6 + (11 * i);
7968 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7969 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7970 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
7971 }
7972
Jacob Bramleye483ce52019-11-05 16:52:29 +00007973 // st3h { z13.h, z14.h, z15.h }, SVE_VL16
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007974 if (vl_h >= 16) {
7975 for (int i = 0; i < 16; i++) {
7976 int64_t offset = (5 << kHRegSizeInBytesLog2) * vl;
7977 uint16_t lane0 = 6 - (2 * i);
7978 uint16_t lane1 = 7 - (2 * i);
7979 uint16_t lane2 = 8 - (2 * i);
7980 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7981 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7982 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
7983 }
7984 }
7985
7986 // st3w { z30.s, z31.s, z0.s }, ((i % 5) == 0)
7987 for (int i = 0; i < vl_s; i++) {
7988 if ((i % 5) == 0) {
7989 int64_t offset = -(5 << kSRegSizeInBytesLog2) * vl;
7990 uint32_t lane0 = -7 + (3 * i);
7991 uint32_t lane1 = -8 + (3 * i);
7992 uint32_t lane2 = -9 + (3 * i);
7993 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7994 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7995 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
7996 }
7997 }
7998
7999 // st3d { z31.d, z0.d, z1.d }, SVE_MUL3
8000 int vl_d_mul3 = vl_d - (vl_d % 3);
8001 for (int i = 0; i < vl_d_mul3; i++) {
Jacob Bramleye483ce52019-11-05 16:52:29 +00008002 int64_t offset = -(1 << kDRegSizeInBytesLog2) * vl;
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008003 uint64_t lane0 = 32 - (11 * i);
8004 uint64_t lane1 = 33 - (11 * i);
8005 uint64_t lane2 = 34 - (11 * i);
8006 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
8007 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
8008 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
8009 }
8010
8011 ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
8012
Jacob Bramleye483ce52019-11-05 16:52:29 +00008013 // Check that we loaded back the expected values.
8014
8015 // st3b/ld3b
8016 ASSERT_EQUAL_SVE(z4, z16);
8017 ASSERT_EQUAL_SVE(z5, z17);
8018 ASSERT_EQUAL_SVE(z6, z18);
8019
8020 // st3h/ld3h
8021 ASSERT_EQUAL_SVE(z7, z19);
8022 ASSERT_EQUAL_SVE(z8, z20);
8023 ASSERT_EQUAL_SVE(z9, z21);
8024
8025 // st3w/ld3w
8026 ASSERT_EQUAL_SVE(z10, z22);
8027 ASSERT_EQUAL_SVE(z11, z23);
8028 ASSERT_EQUAL_SVE(z12, z24);
8029
8030 // st3d/ld3d
8031 ASSERT_EQUAL_SVE(z13, z25);
8032 ASSERT_EQUAL_SVE(z14, z26);
8033 ASSERT_EQUAL_SVE(z15, z27);
8034
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008035 delete[] expected;
8036 }
8037 delete[] data;
8038}
8039
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008040TEST_SVE(sve_ld4_st4_scalar_plus_imm) {
8041 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
8042 START();
8043
8044 int vl = config->sve_vl_in_bytes();
8045
8046   // The immediate can address [-32, 28] times the VL, so allocate enough space
8047 // to exceed that in both directions.
8048 int data_size = vl * 128;
8049
8050 uint8_t* data = new uint8_t[data_size];
8051 memset(data, 0, data_size);
8052
8053   // Set the base half-way through the buffer so we can use negative indices.
8054 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
8055
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00008056 // We can test ld4 by comparing the values loaded with the values stored.
8057 // There are two complications:
8058 // - Loads have zeroing predication, so we have to clear the inactive
8059 // elements on our reference.
8060 // - We want to test both loads and stores that span { z31, z0 }, so we have
8061 // to move some values around.
8062 //
8063 // Registers z3-z18 will hold as-stored values (with inactive elements
8064 // cleared). Registers z19-z31 and z0-z2 will hold the values that were
8065 // loaded.
8066
8067 __ Index(z10.VnB(), 1, -7);
8068 __ Index(z11.VnB(), 2, -7);
8069 __ Index(z12.VnB(), 3, -7);
8070 __ Index(z13.VnB(), 4, -7);
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008071 __ Ptrue(p0.VnB());
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00008072 __ St4b(z10.VnB(), z11.VnB(), z12.VnB(), z13.VnB(), p0, SVEMemOperand(x0));
8073 // Save the stored values for ld4 tests.
8074 __ Dup(z3.VnB(), 0);
8075 __ Dup(z4.VnB(), 0);
8076 __ Dup(z5.VnB(), 0);
8077 __ Dup(z6.VnB(), 0);
8078 __ Mov(z3.VnB(), p0.Merging(), z10.VnB());
8079 __ Mov(z4.VnB(), p0.Merging(), z11.VnB());
8080 __ Mov(z5.VnB(), p0.Merging(), z12.VnB());
8081 __ Mov(z6.VnB(), p0.Merging(), z13.VnB());
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008082
8083 // Wrap around from z31 to z0.
8084 __ Index(z31.VnH(), -2, 5);
8085 __ Index(z0.VnH(), -3, 5);
8086 __ Index(z1.VnH(), -4, 5);
8087 __ Index(z2.VnH(), -5, 5);
8088 __ Ptrue(p1.VnH(), SVE_MUL3);
8089 __ St4h(z31.VnH(),
8090 z0.VnH(),
8091 z1.VnH(),
8092 z2.VnH(),
8093 p1,
8094 SVEMemOperand(x0, 4, SVE_MUL_VL));
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00008095 // Save the stored values for ld4 tests.
8096 __ Dup(z7.VnH(), 0);
8097 __ Dup(z8.VnH(), 0);
8098 __ Dup(z9.VnH(), 0);
8099 __ Dup(z10.VnH(), 0);
8100 __ Mov(z7.VnH(), p1.Merging(), z31.VnH());
8101 __ Mov(z8.VnH(), p1.Merging(), z0.VnH());
8102 __ Mov(z9.VnH(), p1.Merging(), z1.VnH());
8103 __ Mov(z10.VnH(), p1.Merging(), z2.VnH());
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008104
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00008105 // Wrap around from z31 to z0.
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008106 __ Index(z29.VnS(), 2, -7);
8107 __ Index(z30.VnS(), 3, -7);
8108 __ Index(z31.VnS(), 4, -7);
8109 __ Index(z0.VnS(), 5, -7);
8110 __ Ptrue(p2.VnS(), SVE_POW2);
8111 __ St4w(z29.VnS(),
8112 z30.VnS(),
8113 z31.VnS(),
8114 z0.VnS(),
8115 p2,
8116 SVEMemOperand(x0, -12, SVE_MUL_VL));
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00008117 // Save the stored values for ld4 tests.
8118 __ Dup(z11.VnS(), 0);
8119 __ Dup(z12.VnS(), 0);
8120 __ Dup(z13.VnS(), 0);
8121 __ Dup(z14.VnS(), 0);
8122 __ Mov(z11.VnS(), p2.Merging(), z29.VnS());
8123 __ Mov(z12.VnS(), p2.Merging(), z30.VnS());
8124 __ Mov(z13.VnS(), p2.Merging(), z31.VnS());
8125 __ Mov(z14.VnS(), p2.Merging(), z0.VnS());
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008126
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00008127 __ Index(z20.VnD(), -7, 8);
8128 __ Index(z21.VnD(), -8, 8);
8129 __ Index(z22.VnD(), -9, 8);
8130 __ Index(z23.VnD(), -10, 8);
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008131 // Sparse predication, including some irrelevant bits (0xee). To make the
8132 // results easy to check, activate each lane <n> where n is a multiple of 5.
8133 Initialise(&masm,
8134 p3,
8135 0xeee10000000001ee,
8136 0xeeeeeee100000000,
8137 0x01eeeeeeeee10000,
8138 0x000001eeeeeeeee1);
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00008139 __ St4d(z20.VnD(),
8140 z21.VnD(),
8141 z22.VnD(),
8142 z23.VnD(),
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008143 p3,
8144 SVEMemOperand(x0, 16, SVE_MUL_VL));
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00008145 // Save the stored values for ld4 tests.
8146 __ Dup(z15.VnD(), 0);
8147 __ Dup(z16.VnD(), 0);
8148 __ Dup(z17.VnD(), 0);
8149 __ Dup(z18.VnD(), 0);
8150 __ Mov(z15.VnD(), p3.Merging(), z20.VnD());
8151 __ Mov(z16.VnD(), p3.Merging(), z21.VnD());
8152 __ Mov(z17.VnD(), p3.Merging(), z22.VnD());
8153 __ Mov(z18.VnD(), p3.Merging(), z23.VnD());
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008154
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00008155 // Corresponding loads.
8156 // Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
8157 __ Ld4b(z31.VnB(),
8158 z0.VnB(),
8159 z1.VnB(),
8160 z2.VnB(),
8161 p0.Zeroing(),
8162 SVEMemOperand(x0));
8163 __ Mov(z19, z31);
8164 __ Mov(z20, z0);
8165 __ Mov(z21, z1);
8166 __ Mov(z22, z2);
8167 __ Ld4h(z23.VnH(),
8168 z24.VnH(),
8169 z25.VnH(),
8170 z26.VnH(),
8171 p1.Zeroing(),
8172 SVEMemOperand(x0, 4, SVE_MUL_VL));
8173 __ Ld4w(z27.VnS(),
8174 z28.VnS(),
8175 z29.VnS(),
8176 z30.VnS(),
8177 p2.Zeroing(),
8178 SVEMemOperand(x0, -12, SVE_MUL_VL));
8179 // Wrap around from z31 to z0.
8180 __ Ld4d(z31.VnD(),
8181 z0.VnD(),
8182 z1.VnD(),
8183 z2.VnD(),
8184 p3.Zeroing(),
8185 SVEMemOperand(x0, 16, SVE_MUL_VL));
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008186
8187 END();
8188
8189 if (CAN_RUN()) {
8190 RUN();
8191
8192 uint8_t* expected = new uint8_t[data_size];
8193 memset(expected, 0, data_size);
8194 uint8_t* middle = &expected[data_size / 2];
8195
8196 int vl_b = vl / kBRegSizeInBytes;
8197 int vl_h = vl / kHRegSizeInBytes;
8198 int vl_s = vl / kSRegSizeInBytes;
8199 int vl_d = vl / kDRegSizeInBytes;
8200
8201 int reg_count = 4;
8202
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00008203     // st4b { z10.b, z11.b, z12.b, z13.b }, SVE_ALL
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008204 for (int i = 0; i < vl_b; i++) {
8205 uint8_t lane0 = 1 - (7 * i);
8206 uint8_t lane1 = 2 - (7 * i);
8207 uint8_t lane2 = 3 - (7 * i);
8208 uint8_t lane3 = 4 - (7 * i);
8209 MemoryWrite(middle, 0, (i * reg_count) + 0, lane0);
8210 MemoryWrite(middle, 0, (i * reg_count) + 1, lane1);
8211 MemoryWrite(middle, 0, (i * reg_count) + 2, lane2);
8212 MemoryWrite(middle, 0, (i * reg_count) + 3, lane3);
8213 }
8214
8215 // st4h { z31.h, z0.h, z1.h, z2.h }, SVE_MUL3
8216 int vl_h_mul3 = vl_h - (vl_h % 3);
8217 for (int i = 0; i < vl_h_mul3; i++) {
8218 int64_t offset = 4 * vl;
8219 uint16_t lane0 = -2 + (5 * i);
8220 uint16_t lane1 = -3 + (5 * i);
8221 uint16_t lane2 = -4 + (5 * i);
8222 uint16_t lane3 = -5 + (5 * i);
8223 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
8224 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
8225 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
8226 MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
8227 }
8228
8229 // st4w { z29.s, z30.s, z31.s, z0.s }, SVE_POW2
8230 int vl_s_pow2 = 1 << HighestSetBitPosition(vl_s);
8231 for (int i = 0; i < vl_s_pow2; i++) {
8232 int64_t offset = -12 * vl;
8233 uint32_t lane0 = 2 - (7 * i);
8234 uint32_t lane1 = 3 - (7 * i);
8235 uint32_t lane2 = 4 - (7 * i);
8236 uint32_t lane3 = 5 - (7 * i);
8237 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
8238 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
8239 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
8240 MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
8241 }
8242
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00008243 // st4d { z20.d, z21.d, z22.d, z23.d }, ((i % 5) == 0)
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008244 for (int i = 0; i < vl_d; i++) {
8245 if ((i % 5) == 0) {
8246 int64_t offset = 16 * vl;
8247 uint64_t lane0 = -7 + (8 * i);
8248 uint64_t lane1 = -8 + (8 * i);
8249 uint64_t lane2 = -9 + (8 * i);
8250 uint64_t lane3 = -10 + (8 * i);
8251 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
8252 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
8253 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
8254 MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
8255 }
8256 }
8257
8258 ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
8259
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00008260 // Check that we loaded back the expected values.
8261
8262 // st4b/ld4b
8263 ASSERT_EQUAL_SVE(z3, z19);
8264 ASSERT_EQUAL_SVE(z4, z20);
8265 ASSERT_EQUAL_SVE(z5, z21);
8266 ASSERT_EQUAL_SVE(z6, z22);
8267
8268 // st4h/ld4h
8269 ASSERT_EQUAL_SVE(z7, z23);
8270 ASSERT_EQUAL_SVE(z8, z24);
8271 ASSERT_EQUAL_SVE(z9, z25);
8272 ASSERT_EQUAL_SVE(z10, z26);
8273
8274 // st4w/ld4w
8275 ASSERT_EQUAL_SVE(z11, z27);
8276 ASSERT_EQUAL_SVE(z12, z28);
8277 ASSERT_EQUAL_SVE(z13, z29);
8278 ASSERT_EQUAL_SVE(z14, z30);
8279
8280 // st4d/ld4d
8281 ASSERT_EQUAL_SVE(z15, z31);
8282 ASSERT_EQUAL_SVE(z16, z0);
8283 ASSERT_EQUAL_SVE(z17, z1);
8284 ASSERT_EQUAL_SVE(z18, z2);
8285
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008286 delete[] expected;
8287 }
8288 delete[] data;
8289}
8290
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008291TEST_SVE(sve_ld4_st4_scalar_plus_scalar) {
8292 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
8293 START();
8294
8295 int vl = config->sve_vl_in_bytes();
8296
8297 // Allocate plenty of space to enable indexing in both directions.
8298 int data_size = vl * 128;
8299
8300 uint8_t* data = new uint8_t[data_size];
8301 memset(data, 0, data_size);
8302
8303   // Set the base half-way through the buffer so we can use negative indices.
8304 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
8305
Jacob Bramleye483ce52019-11-05 16:52:29 +00008306 // We can test ld4 by comparing the values loaded with the values stored.
8307 // There are two complications:
8308 // - Loads have zeroing predication, so we have to clear the inactive
8309 // elements on our reference.
8310 // - We want to test both loads and stores that span { z31, z0 }, so we have
8311 // to move some values around.
8312 //
8313 // Registers z3-z18 will hold as-stored values (with inactive elements
8314 // cleared). Registers z19-z31 and z0-z2 will hold the values that were
8315 // loaded.
8316
8317 __ Index(z19.VnB(), -4, 11);
8318 __ Index(z20.VnB(), -5, 11);
8319 __ Index(z21.VnB(), -6, 11);
8320 __ Index(z22.VnB(), -7, 11);
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008321 __ Ptrue(p7.VnB(), SVE_MUL4);
Jacob Bramleye483ce52019-11-05 16:52:29 +00008322 __ Rdvl(x1, -1); // Make offsets VL-dependent so we can avoid overlap.
8323 __ St4b(z19.VnB(),
8324 z20.VnB(),
8325 z21.VnB(),
8326 z22.VnB(),
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008327 p7,
8328 SVEMemOperand(x0, x1, LSL, 0));
Jacob Bramleye483ce52019-11-05 16:52:29 +00008329 // Save the stored values for ld4 tests.
8330 __ Dup(z3.VnB(), 0);
8331 __ Dup(z4.VnB(), 0);
8332 __ Dup(z5.VnB(), 0);
8333 __ Dup(z6.VnB(), 0);
8334 __ Mov(z3.VnB(), p7.Merging(), z19.VnB());
8335 __ Mov(z4.VnB(), p7.Merging(), z20.VnB());
8336 __ Mov(z5.VnB(), p7.Merging(), z21.VnB());
8337 __ Mov(z6.VnB(), p7.Merging(), z22.VnB());
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008338
Jacob Bramleye483ce52019-11-05 16:52:29 +00008339 __ Index(z23.VnH(), 6, -2);
8340 __ Index(z24.VnH(), 7, -2);
8341 __ Index(z25.VnH(), 8, -2);
8342 __ Index(z26.VnH(), 9, -2);
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008343 __ Ptrue(p6.VnH(), SVE_VL16);
Jacob Bramleye483ce52019-11-05 16:52:29 +00008344 __ Rdvl(x2, 7); // (7 * vl) << 1 = 14 * vl
8345 __ St4h(z23.VnH(),
8346 z24.VnH(),
8347 z25.VnH(),
8348 z26.VnH(),
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008349 p6,
8350 SVEMemOperand(x0, x2, LSL, 1));
Jacob Bramleye483ce52019-11-05 16:52:29 +00008351 // Save the stored values for ld4 tests.
8352 __ Dup(z7.VnH(), 0);
8353 __ Dup(z8.VnH(), 0);
8354 __ Dup(z9.VnH(), 0);
8355 __ Dup(z10.VnH(), 0);
8356 __ Mov(z7.VnH(), p6.Merging(), z23.VnH());
8357 __ Mov(z8.VnH(), p6.Merging(), z24.VnH());
8358 __ Mov(z9.VnH(), p6.Merging(), z25.VnH());
8359 __ Mov(z10.VnH(), p6.Merging(), z26.VnH());
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008360
8361 // Wrap around from z31 to z0.
8362 __ Index(z29.VnS(), -6, 7);
8363 __ Index(z30.VnS(), -7, 7);
8364 __ Index(z31.VnS(), -8, 7);
8365 __ Index(z0.VnS(), -9, 7);
8366 // Sparse predication, including some irrelevant bits (0xe). To make the
8367 // results easy to check, activate each lane <n> where n is a multiple of 5.
8368 Initialise(&masm,
8369 p5,
8370 0xeee1000010000100,
8371 0x001eeee100001000,
8372 0x0100001eeee10000,
8373 0x10000100001eeee1);
Jacob Bramleye483ce52019-11-05 16:52:29 +00008374 __ Rdvl(x3, -5); // -(5 * vl) << 2 = -20 * vl
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008375 __ St4w(z29.VnS(),
8376 z30.VnS(),
8377 z31.VnS(),
8378 z0.VnS(),
8379 p5,
Jacob Bramleye483ce52019-11-05 16:52:29 +00008380 SVEMemOperand(x0, x3, LSL, 2));
8381 // Save the stored values for ld4 tests.
8382 __ Dup(z11.VnS(), 0);
8383 __ Dup(z12.VnS(), 0);
8384 __ Dup(z13.VnS(), 0);
8385 __ Dup(z14.VnS(), 0);
8386 __ Mov(z11.VnS(), p5.Merging(), z29.VnS());
8387 __ Mov(z12.VnS(), p5.Merging(), z30.VnS());
8388 __ Mov(z13.VnS(), p5.Merging(), z31.VnS());
8389 __ Mov(z14.VnS(), p5.Merging(), z0.VnS());
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008390
8391 __ Index(z31.VnD(), 32, -11);
8392 __ Index(z0.VnD(), 33, -11);
8393 __ Index(z1.VnD(), 34, -11);
8394 __ Index(z2.VnD(), 35, -11);
8395 __ Ptrue(p4.VnD(), SVE_MUL3);
Jacob Bramleye483ce52019-11-05 16:52:29 +00008396   __ Rdvl(x4, -1);  // -(1 * vl) << 3 = -8 * vl
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008397 __ St4d(z31.VnD(),
8398 z0.VnD(),
8399 z1.VnD(),
8400 z2.VnD(),
8401 p4,
Jacob Bramleye483ce52019-11-05 16:52:29 +00008402 SVEMemOperand(x0, x4, LSL, 3));
8403 // Save the stored values for ld4 tests.
8404 __ Dup(z15.VnD(), 0);
8405 __ Dup(z16.VnD(), 0);
8406 __ Dup(z17.VnD(), 0);
8407 __ Dup(z18.VnD(), 0);
8408 __ Mov(z15.VnD(), p4.Merging(), z31.VnD());
8409 __ Mov(z16.VnD(), p4.Merging(), z0.VnD());
8410 __ Mov(z17.VnD(), p4.Merging(), z1.VnD());
8411 __ Mov(z18.VnD(), p4.Merging(), z2.VnD());
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008412
Jacob Bramleye483ce52019-11-05 16:52:29 +00008413 // Corresponding loads.
8414 // Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
8415 __ Ld4b(z31.VnB(),
8416 z0.VnB(),
8417 z1.VnB(),
8418 z2.VnB(),
8419 p7.Zeroing(),
8420 SVEMemOperand(x0, x1, LSL, 0));
8421 __ Mov(z19, z31);
8422 __ Mov(z20, z0);
8423 __ Mov(z21, z1);
8424 __ Mov(z22, z2);
8425 __ Ld4h(z23.VnH(),
8426 z24.VnH(),
8427 z25.VnH(),
8428 z26.VnH(),
8429 p6.Zeroing(),
8430 SVEMemOperand(x0, x2, LSL, 1));
8431 __ Ld4w(z27.VnS(),
8432 z28.VnS(),
8433 z29.VnS(),
8434 z30.VnS(),
8435 p5.Zeroing(),
8436 SVEMemOperand(x0, x3, LSL, 2));
8437 // Wrap around from z31 to z0.
8438 __ Ld4d(z31.VnD(),
8439 z0.VnD(),
8440 z1.VnD(),
8441 z2.VnD(),
8442 p4.Zeroing(),
8443 SVEMemOperand(x0, x4, LSL, 3));
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008444
8445 END();
8446
8447 if (CAN_RUN()) {
8448 RUN();
8449
8450 uint8_t* expected = new uint8_t[data_size];
8451 memset(expected, 0, data_size);
8452 uint8_t* middle = &expected[data_size / 2];
8453
8454 int vl_b = vl / kBRegSizeInBytes;
8455 int vl_h = vl / kHRegSizeInBytes;
8456 int vl_s = vl / kSRegSizeInBytes;
8457 int vl_d = vl / kDRegSizeInBytes;
8458
8459 int reg_count = 4;
8460
Jacob Bramleye483ce52019-11-05 16:52:29 +00008461 // st4b { z19.b, z20.b, z21.b, z22.b }, SVE_MUL4
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008462 int vl_b_mul4 = vl_b - (vl_b % 4);
8463 for (int i = 0; i < vl_b_mul4; i++) {
Jacob Bramleye483ce52019-11-05 16:52:29 +00008464 int64_t offset = -(1 << kBRegSizeInBytesLog2) * vl;
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008465 uint8_t lane0 = -4 + (11 * i);
8466 uint8_t lane1 = -5 + (11 * i);
8467 uint8_t lane2 = -6 + (11 * i);
8468 uint8_t lane3 = -7 + (11 * i);
8469 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
8470 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
8471 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
8472 MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
8473 }
8474
Jacob Bramleye483ce52019-11-05 16:52:29 +00008475     // st4h { z23.h, z24.h, z25.h, z26.h }, SVE_VL16
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008476 if (vl_h >= 16) {
8477 for (int i = 0; i < 16; i++) {
8478 int64_t offset = (7 << kHRegSizeInBytesLog2) * vl;
8479 uint16_t lane0 = 6 - (2 * i);
8480 uint16_t lane1 = 7 - (2 * i);
8481 uint16_t lane2 = 8 - (2 * i);
8482 uint16_t lane3 = 9 - (2 * i);
8483 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
8484 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
8485 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
8486 MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
8487 }
8488 }
8489
8490 // st4w { z29.s, z30.s, z31.s, z0.s }, ((i % 5) == 0)
8491 for (int i = 0; i < vl_s; i++) {
8492 if ((i % 5) == 0) {
8493 int64_t offset = -(5 << kSRegSizeInBytesLog2) * vl;
8494 uint32_t lane0 = -6 + (7 * i);
8495 uint32_t lane1 = -7 + (7 * i);
8496 uint32_t lane2 = -8 + (7 * i);
8497 uint32_t lane3 = -9 + (7 * i);
8498 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
8499 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
8500 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
8501 MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
8502 }
8503 }
8504
8505 // st4d { z31.d, z0.d, z1.d, z2.d }, SVE_MUL3
8506 int vl_d_mul3 = vl_d - (vl_d % 3);
8507 for (int i = 0; i < vl_d_mul3; i++) {
Jacob Bramleye483ce52019-11-05 16:52:29 +00008508 int64_t offset = -(1 << kDRegSizeInBytesLog2) * vl;
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008509 uint64_t lane0 = 32 - (11 * i);
8510 uint64_t lane1 = 33 - (11 * i);
8511 uint64_t lane2 = 34 - (11 * i);
8512 uint64_t lane3 = 35 - (11 * i);
8513 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
8514 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
8515 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
8516 MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
8517 }
8518
8519 ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
8520
Jacob Bramleye483ce52019-11-05 16:52:29 +00008521 // Check that we loaded back the expected values.
8522
8523 // st4b/ld4b
8524 ASSERT_EQUAL_SVE(z3, z19);
8525 ASSERT_EQUAL_SVE(z4, z20);
8526 ASSERT_EQUAL_SVE(z5, z21);
8527 ASSERT_EQUAL_SVE(z6, z22);
8528
8529 // st4h/ld4h
8530 ASSERT_EQUAL_SVE(z7, z23);
8531 ASSERT_EQUAL_SVE(z8, z24);
8532 ASSERT_EQUAL_SVE(z9, z25);
8533 ASSERT_EQUAL_SVE(z10, z26);
8534
8535 // st4w/ld4w
8536 ASSERT_EQUAL_SVE(z11, z27);
8537 ASSERT_EQUAL_SVE(z12, z28);
8538 ASSERT_EQUAL_SVE(z13, z29);
8539 ASSERT_EQUAL_SVE(z14, z30);
8540
8541 // st4d/ld4d
8542 ASSERT_EQUAL_SVE(z15, z31);
8543 ASSERT_EQUAL_SVE(z16, z0);
8544 ASSERT_EQUAL_SVE(z17, z1);
8545 ASSERT_EQUAL_SVE(z18, z2);
8546
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008547 delete[] expected;
8548 }
8549 delete[] data;
8550}
8551
8552TEST_SVE(sve_ld234_st234_scalar_plus_scalar_sp) {
8553 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
8554 START();
8555
8556 // Check that the simulator correctly interprets rn == 31 as sp.
8557 // The indexing logic is the same regardless so we just check one load and
8558 // store of each type.
8559
8560 // There are no pre- or post-indexing modes, so reserve space first.
8561 __ ClaimVL(2 + 3 + 4);
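// The 9 VLs claimed above are carved into disjoint regions by the offsets
// computed below: [0, 2 * VL) for the st2b, [2 * VL, 5 * VL) for the st3h and
// [5 * VL, 9 * VL) for the st4w, so the stores cannot overlap.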
8562
8563 __ Index(z0.VnB(), 42, 2);
8564 __ Index(z1.VnB(), 43, 2);
8565 __ Ptrue(p0.VnB(), SVE_VL7);
8566 __ Rdvl(x0, 0);
8567 __ St2b(z0.VnB(), z1.VnB(), p0, SVEMemOperand(sp, x0));
8568
8569 __ Index(z4.VnH(), 42, 3);
8570 __ Index(z5.VnH(), 43, 3);
8571 __ Index(z6.VnH(), 44, 3);
8572 __ Ptrue(p1.VnH(), SVE_POW2);
8573 __ Rdvl(x1, 2);
8574 __ Lsr(x1, x1, 1);
8575 __ St3h(z4.VnH(), z5.VnH(), z6.VnH(), p1, SVEMemOperand(sp, x1, LSL, 1));
8576
8577 __ Index(z8.VnS(), 42, 4);
8578 __ Index(z9.VnS(), 43, 4);
8579 __ Index(z10.VnS(), 44, 4);
8580 __ Index(z11.VnS(), 45, 4);
8581 __ Ptrue(p2.VnS());
8582 __ Rdvl(x2, 2 + 3);
8583 __ Lsr(x2, x2, 2);
8584 __ St4w(z8.VnS(),
8585 z9.VnS(),
8586 z10.VnS(),
8587 z11.VnS(),
8588 p2,
8589 SVEMemOperand(sp, x2, LSL, 2));
8590
Jacob Bramleye483ce52019-11-05 16:52:29 +00008591 // Corresponding loads.
8592 // We have to explicitly zero inactive lanes in the reference values because
8593 // loads have zeroing predication.
8594 __ Dup(z12.VnB(), 0);
8595 __ Dup(z13.VnB(), 0);
8596 __ Mov(z12.VnB(), p0.Merging(), z0.VnB());
8597 __ Mov(z13.VnB(), p0.Merging(), z1.VnB());
8598 __ Ld2b(z0.VnB(), z1.VnB(), p0.Zeroing(), SVEMemOperand(sp, x0));
8599
8600 __ Dup(z16.VnH(), 0);
8601 __ Dup(z17.VnH(), 0);
8602 __ Dup(z18.VnH(), 0);
8603 __ Mov(z16.VnH(), p1.Merging(), z4.VnH());
8604 __ Mov(z17.VnH(), p1.Merging(), z5.VnH());
8605 __ Mov(z18.VnH(), p1.Merging(), z6.VnH());
8606 __ Ld3h(z4.VnH(),
8607 z5.VnH(),
8608 z6.VnH(),
8609 p1.Zeroing(),
8610 SVEMemOperand(sp, x1, LSL, 1));
8611
8612 __ Dup(z20.VnS(), 0);
8613 __ Dup(z21.VnS(), 0);
8614 __ Dup(z22.VnS(), 0);
8615 __ Dup(z23.VnS(), 0);
8616 __ Mov(z20.VnS(), p2.Merging(), z8.VnS());
8617 __ Mov(z21.VnS(), p2.Merging(), z9.VnS());
8618 __ Mov(z22.VnS(), p2.Merging(), z10.VnS());
8619 __ Mov(z23.VnS(), p2.Merging(), z11.VnS());
8620 __ Ld4w(z8.VnS(),
8621 z9.VnS(),
8622 z10.VnS(),
8623 z11.VnS(),
8624 p2.Zeroing(),
8625 SVEMemOperand(sp, x2, LSL, 2));
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008626
8627 __ DropVL(2 + 3 + 4);
8628
8629 END();
8630
8631 if (CAN_RUN()) {
8632 RUN();
8633
8634     // The most likely failure mode is that the simulator reads sp as xzr and
8635 // crashes on execution. We already test the address calculations separately
8636 // and sp doesn't change this, so just test that we load the values we
8637 // stored.
Jacob Bramleye483ce52019-11-05 16:52:29 +00008638
8639 // st2b/ld2b
8640 ASSERT_EQUAL_SVE(z0, z12);
8641 ASSERT_EQUAL_SVE(z1, z13);
8642
8643 // st3h/ld3h
8644 ASSERT_EQUAL_SVE(z4, z16);
8645 ASSERT_EQUAL_SVE(z5, z17);
8646 ASSERT_EQUAL_SVE(z6, z18);
8647
8648     // st4w/ld4w
8649 ASSERT_EQUAL_SVE(z8, z20);
8650 ASSERT_EQUAL_SVE(z9, z21);
8651 ASSERT_EQUAL_SVE(z10, z22);
8652 ASSERT_EQUAL_SVE(z11, z23);
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008653 }
8654}
8655
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008656TEST_SVE(sve_ld234_st234_scalar_plus_imm_sp) {
8657 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
8658 START();
8659
8660 // Check that the simulator correctly interprets rn == 31 as sp.
8661 // The indexing logic is the same regardless so we just check one load and
8662 // store of each type.
8663
8664 // There are no pre- or post-indexing modes, so reserve space first.
8665 // Note that the stores fill in an order that allows each immediate to be a
8666 // multiple of the number of registers.
8667 __ ClaimVL(4 + 2 + 3);
8668
8669 __ Index(z0.VnB(), 42, 2);
8670 __ Index(z1.VnB(), 43, 2);
8671 __ Ptrue(p0.VnB(), SVE_POW2);
8672 __ St2b(z0.VnB(), z1.VnB(), p0, SVEMemOperand(sp, 4, SVE_MUL_VL));
8673
8674 __ Index(z4.VnH(), 42, 3);
8675 __ Index(z5.VnH(), 43, 3);
8676 __ Index(z6.VnH(), 44, 3);
8677 __ Ptrue(p1.VnH(), SVE_VL7);
8678 __ St3h(z4.VnH(), z5.VnH(), z6.VnH(), p1, SVEMemOperand(sp, 6, SVE_MUL_VL));
8679
8680 __ Index(z8.VnS(), 42, 4);
8681 __ Index(z9.VnS(), 43, 4);
8682 __ Index(z10.VnS(), 44, 4);
8683 __ Index(z11.VnS(), 45, 4);
8684 __ Ptrue(p2.VnS());
8685 __ St4w(z8.VnS(), z9.VnS(), z10.VnS(), z11.VnS(), p2, SVEMemOperand(sp));
8686
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00008687 // Corresponding loads.
8688 // We have to explicitly zero inactive lanes in the reference values because
8689 // loads have zeroing predication.
8690 __ Dup(z12.VnB(), 0);
8691 __ Dup(z13.VnB(), 0);
8692 __ Mov(z12.VnB(), p0.Merging(), z0.VnB());
8693 __ Mov(z13.VnB(), p0.Merging(), z1.VnB());
8694 __ Ld2b(z0.VnB(), z1.VnB(), p0.Zeroing(), SVEMemOperand(sp, 4, SVE_MUL_VL));
8695
8696 __ Dup(z16.VnH(), 0);
8697 __ Dup(z17.VnH(), 0);
8698 __ Dup(z18.VnH(), 0);
8699 __ Mov(z16.VnH(), p1.Merging(), z4.VnH());
8700 __ Mov(z17.VnH(), p1.Merging(), z5.VnH());
8701 __ Mov(z18.VnH(), p1.Merging(), z6.VnH());
8702 __ Ld3h(z4.VnH(),
8703 z5.VnH(),
8704 z6.VnH(),
8705 p1.Zeroing(),
8706 SVEMemOperand(sp, 6, SVE_MUL_VL));
8707
8708 __ Dup(z20.VnS(), 0);
8709 __ Dup(z21.VnS(), 0);
8710 __ Dup(z22.VnS(), 0);
8711 __ Dup(z23.VnS(), 0);
8712 __ Mov(z20.VnS(), p2.Merging(), z8.VnS());
8713 __ Mov(z21.VnS(), p2.Merging(), z9.VnS());
8714 __ Mov(z22.VnS(), p2.Merging(), z10.VnS());
8715 __ Mov(z23.VnS(), p2.Merging(), z11.VnS());
8716 __ Ld4w(z8.VnS(),
8717 z9.VnS(),
8718 z10.VnS(),
8719 z11.VnS(),
8720 p2.Zeroing(),
8721 SVEMemOperand(sp));
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008722
8723 __ DropVL(4 + 2 + 3);
8724
8725 END();
8726
8727 if (CAN_RUN()) {
8728 RUN();
8729
8730     // The most likely failure mode is that the simulator reads sp as xzr and
8731 // crashes on execution. We already test the address calculations separately
8732 // and sp doesn't change this, so just test that we load the values we
8733 // stored.
8734     // TODO: Actually do this; the corresponding loads are already emitted above.
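// A sketch of those checks, assuming the same pairing of loaded registers
// with the zeroed reference copies built above (mirroring the
// scalar-plus-scalar test):
//
//   // st2b/ld2b
//   ASSERT_EQUAL_SVE(z0, z12);
//   ASSERT_EQUAL_SVE(z1, z13);
//
//   // st3h/ld3h
//   ASSERT_EQUAL_SVE(z4, z16);
//   ASSERT_EQUAL_SVE(z5, z17);
//   ASSERT_EQUAL_SVE(z6, z18);
//
//   // st4w/ld4w
//   ASSERT_EQUAL_SVE(z8, z20);
//   ASSERT_EQUAL_SVE(z9, z21);
//   ASSERT_EQUAL_SVE(z10, z22);
//   ASSERT_EQUAL_SVE(z11, z23);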
8735 }
8736}
8737
TatWai Chong85e15102020-05-04 21:00:40 -07008738// Fill the input buffer with arbitrary data. Also generate random offsets from
8739// the base address of the buffer, and the corresponding absolute addresses,
8740// writing them to the output arguments if provided.
8741static void BufferFillingHelper(uint64_t data_ptr,
8742 size_t buffer_size,
8743 unsigned lane_size_in_bytes,
8744 int lane_count,
8745 uint64_t* offsets,
8746 uint64_t* addresses = nullptr,
8747 uint64_t* max_address = nullptr) {
8748 // Use a fixed seed for nrand48() so that test runs are reproducible.
8749 unsigned short seed[3] = {1, 2, 3}; // NOLINT(runtime/int)
8750
8751 // Fill a buffer with arbitrary data.
8752 for (size_t i = 0; i < buffer_size; i++) {
8753 uint8_t byte = nrand48(seed) & 0xff;
8754 memcpy(reinterpret_cast<void*>(data_ptr + i), &byte, 1);
8755 }
8756
8757 if (max_address != nullptr) {
8758 *max_address = 0;
8759 }
8760
8761 // Vectors of random addresses and offsets into the buffer.
8762 for (int i = 0; i < lane_count; i++) {
8763 uint64_t rnd = nrand48(seed);
8764 // Limit the range to the set of completely-accessible elements in memory.
8765 offsets[i] = rnd % (buffer_size - lane_size_in_bytes);
8766 if ((addresses != nullptr) && (max_address != nullptr)) {
8767 addresses[i] = data_ptr + offsets[i];
8768 *max_address = std::max(*max_address, addresses[i]);
8769 }
8770 }
8771}
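
// A minimal usage sketch (`buffer` and `kLaneCount` are hypothetical names,
// not defined in this file):
//
//   uint64_t offsets[kLaneCount];
//   uint64_t addresses[kLaneCount];
//   uint64_t max_address;
//   BufferFillingHelper(reinterpret_cast<uint64_t>(buffer),
//                       sizeof(buffer),
//                       kSRegSizeInBytes,  // Lane size, in bytes.
//                       kLaneCount,
//                       offsets,
//                       addresses,
//                       &max_address);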
8772
TatWai Chong85e15102020-05-04 21:00:40 -07008773static void ScalarLoadHelper(MacroAssembler* masm,
8774 Register dst,
8775 Register addr,
8776 int msize_in_bits,
8777 bool is_signed) {
8778 if (is_signed) {
8779 switch (msize_in_bits) {
8780 case kBRegSize:
8781 masm->Ldrsb(dst, MemOperand(addr));
8782 break;
8783 case kHRegSize:
8784 masm->Ldrsh(dst, MemOperand(addr));
8785 break;
8786 case kWRegSize:
8787 masm->Ldrsw(dst, MemOperand(addr));
8788 break;
8789 default:
8790 VIXL_UNIMPLEMENTED();
8791 break;
8792 }
8793 } else {
8794 switch (msize_in_bits) {
8795 case kBRegSize:
8796 masm->Ldrb(dst, MemOperand(addr));
8797 break;
8798 case kHRegSize:
8799 masm->Ldrh(dst, MemOperand(addr));
8800 break;
8801 case kWRegSize:
8802 masm->Ldr(dst.W(), MemOperand(addr));
8803 break;
8804 case kXRegSize:
8805 masm->Ldr(dst, MemOperand(addr));
8806 break;
8807 default:
8808 VIXL_UNIMPLEMENTED();
8809 break;
8810 }
8811 }
8812}
8813
8814// Generate a reference result using scalar loads.
8815// For now this helper doesn't save and restore the caller registers.
8816// Clobbers z30, x28, x29 and p7.
8817template <size_t N>
8818static void ScalarLoadHelper(MacroAssembler* masm,
8819 int vl,
8820 const uint64_t (&addresses)[N],
8821 const ZRegister& zt_ref,
8822 const PRegisterZ& pg,
8823 unsigned esize_in_bits,
8824 unsigned msize_in_bits,
8825 bool is_signed) {
8826 unsigned esize_in_bytes = esize_in_bits / kBitsPerByte;
8827 ZRegister lane_numbers = z30.WithLaneSize(esize_in_bits);
8828 masm->Index(lane_numbers, 0, 1);
8829 masm->Dup(zt_ref, 0);
8830 for (unsigned i = 0; i < (vl / esize_in_bytes); i++) {
8831 masm->Mov(x29, addresses[N - i - 1]);
8832 Register rt(28, std::min(std::max(esize_in_bits, kSRegSize), kDRegSize));
8833 ScalarLoadHelper(masm, rt, x29, msize_in_bits, is_signed);
8834
8835 // Emulate predication.
8836 masm->Cmpeq(p7.WithLaneSize(esize_in_bits), pg, lane_numbers, i);
8837 masm->Cpy(zt_ref, p7.Merging(), rt);
8838 }
8839}
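
// In effect, the helper above computes the following for each element i of
// zt_ref (a sketch; note the reversed indexing into `addresses`):
//
//   zt_ref[i] = pg[i] ? Extend(Memory[addresses[N - 1 - i]], msize, esize) : 0;
//
// where Extend() sign- or zero-extends from msize to esize bits.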
8840
TatWai Chong113d9192020-05-19 01:02:36 -07008841typedef void (MacroAssembler::*Ld1Macro)(const ZRegister& zt,
8842 const PRegisterZ& pg,
8843 const SVEMemOperand& addr);
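
// A Ld1Macro is a pointer to one of the MacroAssembler's single-register SVE
// load members taking this three-argument form (for example
// &MacroAssembler::Ldff1b), and is invoked via the pointer-to-member syntax:
//
//   (masm.*ldff1)(zt, pg, addr);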
8844
Martyn Capewella5112342020-06-05 18:20:11 +01008845template <typename T>
TatWai Chong6537a9a2020-05-05 14:15:16 -07008846static void Ldff1Helper(Test* config,
8847 uintptr_t data,
8848 unsigned msize_in_bits,
8849 unsigned esize_in_bits,
TatWai Chong1af34f12020-06-01 20:54:06 -07008850 CPURegister::RegisterType base_type,
TatWai Chong6537a9a2020-05-05 14:15:16 -07008851 Ld1Macro ldff1,
8852 Ld1Macro ld1,
Martyn Capewella5112342020-06-05 18:20:11 +01008853 T mod,
TatWai Chong6537a9a2020-05-05 14:15:16 -07008854 bool scale = false) {
8855 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
8856 START();
8857
8858 int vl = config->sve_vl_in_bytes();
8859 size_t page_size = sysconf(_SC_PAGE_SIZE);
8860 VIXL_ASSERT(page_size > static_cast<size_t>(vl));
8861
8862 unsigned esize_in_bytes = esize_in_bits / kBitsPerByte;
8863 unsigned msize_in_bytes = msize_in_bits / kBitsPerByte;
8864 unsigned msize_in_bytes_log2 = std::log2(msize_in_bytes);
8865 VIXL_ASSERT(msize_in_bits <= esize_in_bits);
8866
8867 PRegister all = p7;
8868 __ Ptrue(all.VnB());
8869
8870 size_t offset_modifier = 0;
8871
Martyn Capewell5f9b3802020-03-24 16:16:36 +00008872 // The highest address at which a load stopped. Every FF load should fault at
TatWai Chong6537a9a2020-05-05 14:15:16 -07008873 // `data + page_size`, so this value should not exceed that value. However,
8874 // the architecture allows fault-tolerant loads to fault arbitrarily, so the
8875 // real value may be lower.
8876 //
 8877 // This is used to check that the `mprotect` applied by the calling test really does
 8878 // make the second page inaccessible, and that the resulting FFR from each load reflects that.
8879 Register limit = x22;
8880 __ Mov(limit, 0);
8881
8882 // If the FFR grows unexpectedly, we increment this register by the
8883 // difference. FFR should never grow, except when explicitly set.
8884 Register ffr_grow_count = x23;
8885 __ Mov(ffr_grow_count, 0);
8886
8887 // Set the offset so that the load is guaranteed to start in the
8888 // accessible page, but end in the inaccessible one.
8889 VIXL_ASSERT((page_size % msize_in_bytes) == 0);
8890 VIXL_ASSERT((vl % msize_in_bytes) == 0);
8891 size_t elements_per_page = page_size / msize_in_bytes;
8892 size_t elements_per_access = vl / esize_in_bytes;
8893 size_t min_offset = (elements_per_page - elements_per_access) + 1;
8894 size_t max_offset = elements_per_page - 1;
8895 size_t offset =
8896 min_offset + (offset_modifier % (max_offset - min_offset + 1));
8897 offset_modifier++;
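  // Worked example (illustration only, assuming a 4096-byte page): for a
  // 384-bit VL with msize = 2 bytes and esize = 4 bytes, elements_per_page is
  // 2048 and elements_per_access is 12, so offset is at least 2037. The last
  // lane then reads at or beyond `data + page_size` while the first lane stays
  // within the accessible page.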
8898
8899 __ Setffr();
8900 __ Mov(x20, data);
8901 __ Mov(x21, offset);
8902
TatWai Chong1af34f12020-06-01 20:54:06 -07008903 if (base_type == CPURegister::kRegister) {
8904 // Scalar-plus-scalar mode.
Martyn Capewella5112342020-06-05 18:20:11 +01008905 VIXL_ASSERT((std::is_same<T, vixl::aarch64::Shift>::value));
8906 VIXL_ASSERT((static_cast<int>(mod) == LSL) ||
8907 (static_cast<int>(mod) == NO_SHIFT));
8908 (masm.*ldff1)(z0.WithLaneSize(esize_in_bits),
8909 all.Zeroing(),
8910 SVEMemOperand(x20, x21, mod, msize_in_bytes_log2));
8911 } else {
8912 VIXL_ASSERT(base_type == CPURegister::kZRegister);
TatWai Chong1af34f12020-06-01 20:54:06 -07008913 int offs_size;
8914 bool offs_is_unsigned;
Martyn Capewella5112342020-06-05 18:20:11 +01008915 if (std::is_same<T, vixl::aarch64::Extend>::value) {
TatWai Chong1af34f12020-06-01 20:54:06 -07008916 // Scalar-plus-vector mode with a 32-bit packed or unpacked offset, either
 8917 // unscaled or scaled.
Martyn Capewella5112342020-06-05 18:20:11 +01008918 VIXL_ASSERT((static_cast<int>(mod) == SXTW) ||
8919 (static_cast<int>(mod) == UXTW));
TatWai Chong1af34f12020-06-01 20:54:06 -07008920 if (scale) {
 8921 // Gather first-fault byte loads don't support a scaled offset.
8922 VIXL_ASSERT(msize_in_bits != kBRegSize);
8923 }
Martyn Capewella5112342020-06-05 18:20:11 +01008924 offs_is_unsigned = (static_cast<int>(mod) == UXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07008925 offs_size = kSRegSize;
8926
8927 } else {
8928 // Scalar-plus-vector mode with 64-bit unscaled or scaled offset.
Martyn Capewella5112342020-06-05 18:20:11 +01008929 VIXL_ASSERT((std::is_same<T, vixl::aarch64::Shift>::value));
8930 VIXL_ASSERT((static_cast<int>(mod) == LSL) ||
8931 (static_cast<int>(mod) == NO_SHIFT));
TatWai Chong1af34f12020-06-01 20:54:06 -07008932 offs_is_unsigned = false;
8933 offs_size = kDRegSize;
8934 }
8935
TatWai Chong6537a9a2020-05-05 14:15:16 -07008936 // Generate the address pattern "base address + (index << shift)".
 8937 // For unscaled-offset operations, step the (decreasing) indexes by
 8938 // `msize_in_bytes`; otherwise, step the indexes by one element and let the
 8939 // shift provide the scaling.
 8940 int shift = scale ? msize_in_bytes_log2 : 0;
8941 int index_offset = msize_in_bytes >> shift;
8942 VIXL_ASSERT(index_offset > 0);
TatWai Chong6537a9a2020-05-05 14:15:16 -07008943 uint64_t index = 0;
8944 uint64_t base_address = 0;
8945
TatWai Chong1af34f12020-06-01 20:54:06 -07008946 if (offs_is_unsigned) {
TatWai Chong6537a9a2020-05-05 14:15:16 -07008947 // Base address.
8948 base_address = data;
 8949 // Index whose effective offset is `page_size`, the start of the inaccessible page.
8950 index = page_size >> shift;
8951
8952 } else {
8953 // Base address.
8954 base_address = data + (2 * page_size);
8955 // Maximum unsigned positive index.
8956 uint64_t uint_e_max =
8957 (esize_in_bits == kDRegSize) ? UINT64_MAX : UINT32_MAX;
8958 index = uint_e_max - (page_size >> shift) + 1;
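      // `index` is the esize-bit two's-complement encoding of
      // -(page_size >> shift), so `base_address + (index << shift)` wraps back
      // to `data + page_size`.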
8959 }
8960
8961 __ Mov(x19, base_address);
8962 if ((offs_size == kSRegSize) && (esize_in_bits == kDRegSize)) {
 8963 // In this case, the index values are sign- or zero-extended from 32 to
 8964 // 64 bits. Assign an arbitrary value to the top 32 bits to check that
 8965 // only the low 32 bits are used as the index.
8966 index |= 0x1234567800000000;
8967 }
8968
8969 index -= index_offset * (elements_per_access - 1);
8970 __ Index(z17.WithLaneSize(esize_in_bits), index, index_offset);
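    // Lane i of z17 holds `index + i * index_offset`, so the highest lane keeps
    // the original `index`. In both cases above, the address formed for that
    // lane is `data + page_size`, the first inaccessible byte, which is where
    // the first-fault load is expected to stop.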
8971
8972 // Scalar plus vector mode.
8973 (masm.*
8974 ldff1)(z0.WithLaneSize(esize_in_bits),
8975 all.Zeroing(),
8976 SVEMemOperand(x19, z17.WithLaneSize(esize_in_bits), mod, shift));
8977 }
8978
8979 __ Rdffrs(p0.VnB(), all.Zeroing());
8980
 8981 // Execute another Ldff1 with no offset, so that every element can be
 8982 // read. It should respect FFR, and load no more than was loaded the
 8983 // first time.
8984 (masm.*
8985 ldff1)(z16.WithLaneSize(esize_in_bits), all.Zeroing(), SVEMemOperand(x20));
8986 __ Rdffrs(p1.VnB(), all.Zeroing());
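  // Cntp counts the lanes active in FFR after the second load; Uqdecp then
  // subtracts (saturating at zero) the count of lanes active after the first
  // load, so x0 is non-zero only if the FFR grew between the two reads.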
8987 __ Cntp(x0, all, p1.VnB());
8988 __ Uqdecp(x0, p0.VnB());
8989 __ Add(ffr_grow_count, ffr_grow_count, x0);
8990
8991 // Use the FFR to predicate the normal load. If it wasn't properly set,
8992 // the normal load will abort.
8993 (masm.*ld1)(z16.WithLaneSize(esize_in_bits),
8994 p0.Zeroing(),
8995 SVEMemOperand(x20, x21, LSL, msize_in_bytes_log2));
8996
8997 // Work out the address after the one that was just accessed.
8998 __ Incp(x21, p0.WithLaneSize(esize_in_bits));
8999 __ Add(x0, x20, Operand(x21, LSL, msize_in_bytes_log2));
9000 __ Cmp(limit, x0);
9001 __ Csel(limit, limit, x0, hs);
9002
9003 // Clear lanes inactive in FFR. These have an undefined result.
Martyn Capewella24d95c2020-05-20 11:11:15 +01009004 __ Not(p0.VnB(), all.Zeroing(), p0.VnB());
Martyn Capewelle2de6072020-05-22 09:52:06 +01009005 __ Mov(z0.WithLaneSize(esize_in_bits), p0.Merging(), 0);
TatWai Chong6537a9a2020-05-05 14:15:16 -07009006
9007 END();
9008
9009 if (CAN_RUN()) {
9010 RUN();
9011
9012 uintptr_t expected_limit = data + page_size;
9013 uintptr_t measured_limit = core.xreg(limit.GetCode());
9014 VIXL_CHECK(measured_limit <= expected_limit);
9015 if (measured_limit < expected_limit) {
9016 // We can't fail the test for this case, but a warning is helpful for
9017 // manually-run tests.
9018 printf(
9019 "WARNING: All fault-tolerant loads detected faults before the\n"
9020 "expected limit. This is architecturally possible, but improbable,\n"
9021 "and could be a symptom of another problem.\n");
9022 }
9023
9024 ASSERT_EQUAL_64(0, ffr_grow_count);
9025
9026 ASSERT_EQUAL_SVE(z0.WithLaneSize(esize_in_bits),
9027 z16.WithLaneSize(esize_in_bits));
9028 }
9029}
9030
9031TEST_SVE(sve_ldff1_scalar_plus_scalar) {
9032 size_t page_size = sysconf(_SC_PAGE_SIZE);
9033 VIXL_ASSERT(page_size > static_cast<size_t>(config->sve_vl_in_bytes()));
9034
9035 // Allocate two pages, then mprotect the second one to make it inaccessible.
9036 uintptr_t data = reinterpret_cast<uintptr_t>(mmap(NULL,
9037 page_size * 2,
9038 PROT_READ | PROT_WRITE,
9039 MAP_PRIVATE | MAP_ANONYMOUS,
9040 -1,
9041 0));
9042 mprotect(reinterpret_cast<void*>(data + page_size), page_size, PROT_NONE);
9043
9044 // Fill the accessible page with arbitrary data.
9045 for (size_t i = 0; i < page_size; i++) {
9046 // Reverse bits so we get a mixture of positive and negative values.
9047 uint8_t byte = ReverseBits(static_cast<uint8_t>(i));
9048 memcpy(reinterpret_cast<void*>(data + i), &byte, 1);
9049 }
9050
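  // The bound helper fixes the configuration, the data buffer, the addressing
  // form (scalar base plus scalar offset register) and the shift modifier; the
  // remaining placeholders are, in order, the memory element size, the
  // destination element size, the first-fault load and the corresponding
  // normal load.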
Martyn Capewella5112342020-06-05 18:20:11 +01009051 auto ldff1_unscaled_offset_helper = std::bind(&Ldff1Helper<Shift>,
TatWai Chong1af34f12020-06-01 20:54:06 -07009052 config,
9053 data,
9054 std::placeholders::_1,
9055 std::placeholders::_2,
9056 CPURegister::kRegister,
9057 std::placeholders::_3,
9058 std::placeholders::_4,
Martyn Capewella5112342020-06-05 18:20:11 +01009059 NO_SHIFT,
TatWai Chong1af34f12020-06-01 20:54:06 -07009060 false);
9061
TatWai Chong6537a9a2020-05-05 14:15:16 -07009062 Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
9063 Ld1Macro ld1b = &MacroAssembler::Ld1b;
TatWai Chong1af34f12020-06-01 20:54:06 -07009064 ldff1_unscaled_offset_helper(kBRegSize, kBRegSize, ldff1b, ld1b);
9065 ldff1_unscaled_offset_helper(kBRegSize, kHRegSize, ldff1b, ld1b);
9066 ldff1_unscaled_offset_helper(kBRegSize, kSRegSize, ldff1b, ld1b);
9067 ldff1_unscaled_offset_helper(kBRegSize, kDRegSize, ldff1b, ld1b);
TatWai Chong6537a9a2020-05-05 14:15:16 -07009068
9069 Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
9070 Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
TatWai Chong1af34f12020-06-01 20:54:06 -07009071 ldff1_unscaled_offset_helper(kBRegSize, kHRegSize, ldff1sb, ld1sb);
9072 ldff1_unscaled_offset_helper(kBRegSize, kSRegSize, ldff1sb, ld1sb);
9073 ldff1_unscaled_offset_helper(kBRegSize, kDRegSize, ldff1sb, ld1sb);
9074
Martyn Capewella5112342020-06-05 18:20:11 +01009075 auto ldff1_scaled_offset_helper = std::bind(&Ldff1Helper<Shift>,
TatWai Chong1af34f12020-06-01 20:54:06 -07009076 config,
9077 data,
9078 std::placeholders::_1,
9079 std::placeholders::_2,
9080 CPURegister::kRegister,
9081 std::placeholders::_3,
9082 std::placeholders::_4,
Martyn Capewella5112342020-06-05 18:20:11 +01009083 LSL,
TatWai Chong1af34f12020-06-01 20:54:06 -07009084 true);
9085
9086 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
9087 Ld1Macro ld1h = &MacroAssembler::Ld1h;
9088 ldff1_scaled_offset_helper(kHRegSize, kHRegSize, ldff1h, ld1h);
9089 ldff1_scaled_offset_helper(kHRegSize, kSRegSize, ldff1h, ld1h);
9090 ldff1_scaled_offset_helper(kHRegSize, kDRegSize, ldff1h, ld1h);
9091
9092 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
9093 Ld1Macro ld1w = &MacroAssembler::Ld1w;
9094 ldff1_scaled_offset_helper(kSRegSize, kSRegSize, ldff1w, ld1w);
9095 ldff1_scaled_offset_helper(kSRegSize, kDRegSize, ldff1w, ld1w);
9096
9097 Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
9098 Ld1Macro ld1d = &MacroAssembler::Ld1d;
9099 ldff1_scaled_offset_helper(kDRegSize, kDRegSize, ldff1d, ld1d);
TatWai Chong6537a9a2020-05-05 14:15:16 -07009100
9101 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
9102 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
TatWai Chong1af34f12020-06-01 20:54:06 -07009103 ldff1_scaled_offset_helper(kHRegSize, kSRegSize, ldff1sh, ld1sh);
9104 ldff1_scaled_offset_helper(kHRegSize, kDRegSize, ldff1sh, ld1sh);
TatWai Chong6537a9a2020-05-05 14:15:16 -07009105
9106 Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
9107 Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
TatWai Chong1af34f12020-06-01 20:54:06 -07009108 ldff1_scaled_offset_helper(kSRegSize, kDRegSize, ldff1sw, ld1sw);
TatWai Chong6537a9a2020-05-05 14:15:16 -07009109
9110 munmap(reinterpret_cast<void*>(data), page_size * 2);
9111}
9112
TatWai Chong1af34f12020-06-01 20:54:06 -07009113static void sve_ldff1_scalar_plus_vector_32_scaled_offset(Test* config,
9114 uintptr_t data) {
Martyn Capewella5112342020-06-05 18:20:11 +01009115 auto ldff1_32_scaled_offset_helper = std::bind(&Ldff1Helper<Extend>,
TatWai Chong1af34f12020-06-01 20:54:06 -07009116 config,
9117 data,
9118 std::placeholders::_1,
9119 kSRegSize,
9120 CPURegister::kZRegister,
9121 std::placeholders::_2,
9122 std::placeholders::_3,
9123 std::placeholders::_4,
9124 true);
9125 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
9126 Ld1Macro ld1h = &MacroAssembler::Ld1h;
Martyn Capewella5112342020-06-05 18:20:11 +01009127 ldff1_32_scaled_offset_helper(kHRegSize, ldff1h, ld1h, UXTW);
9128 ldff1_32_scaled_offset_helper(kHRegSize, ldff1h, ld1h, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009129
9130 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
9131 Ld1Macro ld1w = &MacroAssembler::Ld1w;
Martyn Capewella5112342020-06-05 18:20:11 +01009132 ldff1_32_scaled_offset_helper(kSRegSize, ldff1w, ld1w, UXTW);
9133 ldff1_32_scaled_offset_helper(kSRegSize, ldff1w, ld1w, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009134
9135 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
9136 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
Martyn Capewella5112342020-06-05 18:20:11 +01009137 ldff1_32_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh, UXTW);
9138 ldff1_32_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009139}
9140
9141static void sve_ldff1_scalar_plus_vector_32_unscaled_offset(Test* config,
9142 uintptr_t data) {
Martyn Capewella5112342020-06-05 18:20:11 +01009143 auto ldff1_32_unscaled_offset_helper = std::bind(&Ldff1Helper<Extend>,
TatWai Chong1af34f12020-06-01 20:54:06 -07009144 config,
9145 data,
9146 std::placeholders::_1,
9147 kSRegSize,
9148 CPURegister::kZRegister,
9149 std::placeholders::_2,
9150 std::placeholders::_3,
9151 std::placeholders::_4,
9152 false);
9153
9154 Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
9155 Ld1Macro ld1b = &MacroAssembler::Ld1b;
Martyn Capewella5112342020-06-05 18:20:11 +01009156 ldff1_32_unscaled_offset_helper(kBRegSize, ldff1b, ld1b, UXTW);
9157 ldff1_32_unscaled_offset_helper(kBRegSize, ldff1b, ld1b, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009158
9159 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
9160 Ld1Macro ld1h = &MacroAssembler::Ld1h;
Martyn Capewella5112342020-06-05 18:20:11 +01009161 ldff1_32_unscaled_offset_helper(kHRegSize, ldff1h, ld1h, UXTW);
9162 ldff1_32_unscaled_offset_helper(kHRegSize, ldff1h, ld1h, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009163
9164 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
9165 Ld1Macro ld1w = &MacroAssembler::Ld1w;
Martyn Capewella5112342020-06-05 18:20:11 +01009166 ldff1_32_unscaled_offset_helper(kSRegSize, ldff1w, ld1w, UXTW);
9167 ldff1_32_unscaled_offset_helper(kSRegSize, ldff1w, ld1w, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009168
9169 Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
9170 Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
Martyn Capewella5112342020-06-05 18:20:11 +01009171 ldff1_32_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb, UXTW);
9172 ldff1_32_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009173
9174 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
9175 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
Martyn Capewella5112342020-06-05 18:20:11 +01009176 ldff1_32_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh, UXTW);
9177 ldff1_32_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009178}
9179
9180static void sve_ldff1_scalar_plus_vector_32_unpacked_scaled_offset(
9181 Test* config, uintptr_t data) {
9182 auto ldff1_32_unpacked_scaled_offset_helper =
Martyn Capewella5112342020-06-05 18:20:11 +01009183 std::bind(&Ldff1Helper<Extend>,
TatWai Chong1af34f12020-06-01 20:54:06 -07009184 config,
9185 data,
9186 std::placeholders::_1,
9187 kDRegSize,
9188 CPURegister::kZRegister,
9189 std::placeholders::_2,
9190 std::placeholders::_3,
9191 std::placeholders::_4,
9192 true);
9193
9194 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
9195 Ld1Macro ld1h = &MacroAssembler::Ld1h;
Martyn Capewella5112342020-06-05 18:20:11 +01009196 ldff1_32_unpacked_scaled_offset_helper(kHRegSize, ldff1h, ld1h, UXTW);
9197 ldff1_32_unpacked_scaled_offset_helper(kHRegSize, ldff1h, ld1h, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009198
9199 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
9200 Ld1Macro ld1w = &MacroAssembler::Ld1w;
Martyn Capewella5112342020-06-05 18:20:11 +01009201 ldff1_32_unpacked_scaled_offset_helper(kSRegSize, ldff1w, ld1w, UXTW);
9202 ldff1_32_unpacked_scaled_offset_helper(kSRegSize, ldff1w, ld1w, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009203
9204 Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
9205 Ld1Macro ld1d = &MacroAssembler::Ld1d;
Martyn Capewella5112342020-06-05 18:20:11 +01009206 ldff1_32_unpacked_scaled_offset_helper(kDRegSize, ldff1d, ld1d, UXTW);
9207 ldff1_32_unpacked_scaled_offset_helper(kDRegSize, ldff1d, ld1d, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009208
9209 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
9210 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
Martyn Capewella5112342020-06-05 18:20:11 +01009211 ldff1_32_unpacked_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh, UXTW);
9212 ldff1_32_unpacked_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009213
9214 Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
9215 Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
Martyn Capewella5112342020-06-05 18:20:11 +01009216 ldff1_32_unpacked_scaled_offset_helper(kSRegSize, ldff1sw, ld1sw, UXTW);
9217 ldff1_32_unpacked_scaled_offset_helper(kSRegSize, ldff1sw, ld1sw, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009218}
9219
9220static void sve_ldff1_scalar_plus_vector_32_unpacked_unscaled_offset(
9221 Test* config, uintptr_t data) {
9222 auto ldff1_32_unpacked_unscaled_offset_helper =
Martyn Capewella5112342020-06-05 18:20:11 +01009223 std::bind(&Ldff1Helper<Extend>,
TatWai Chong1af34f12020-06-01 20:54:06 -07009224 config,
9225 data,
9226 std::placeholders::_1,
9227 kDRegSize,
9228 CPURegister::kZRegister,
9229 std::placeholders::_2,
9230 std::placeholders::_3,
9231 std::placeholders::_4,
9232 false);
9233
9234 Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
9235 Ld1Macro ld1b = &MacroAssembler::Ld1b;
Martyn Capewella5112342020-06-05 18:20:11 +01009236 ldff1_32_unpacked_unscaled_offset_helper(kBRegSize, ldff1b, ld1b, UXTW);
9237 ldff1_32_unpacked_unscaled_offset_helper(kBRegSize, ldff1b, ld1b, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009238
9239 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
9240 Ld1Macro ld1h = &MacroAssembler::Ld1h;
Martyn Capewella5112342020-06-05 18:20:11 +01009241 ldff1_32_unpacked_unscaled_offset_helper(kHRegSize, ldff1h, ld1h, UXTW);
9242 ldff1_32_unpacked_unscaled_offset_helper(kHRegSize, ldff1h, ld1h, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009243
9244 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
9245 Ld1Macro ld1w = &MacroAssembler::Ld1w;
Martyn Capewella5112342020-06-05 18:20:11 +01009246 ldff1_32_unpacked_unscaled_offset_helper(kSRegSize, ldff1w, ld1w, UXTW);
9247 ldff1_32_unpacked_unscaled_offset_helper(kSRegSize, ldff1w, ld1w, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009248
9249 Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
9250 Ld1Macro ld1d = &MacroAssembler::Ld1d;
Martyn Capewella5112342020-06-05 18:20:11 +01009251 ldff1_32_unpacked_unscaled_offset_helper(kDRegSize, ldff1d, ld1d, UXTW);
9252 ldff1_32_unpacked_unscaled_offset_helper(kDRegSize, ldff1d, ld1d, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009253
9254 Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
9255 Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
Martyn Capewella5112342020-06-05 18:20:11 +01009256 ldff1_32_unpacked_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb, UXTW);
9257 ldff1_32_unpacked_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009258
9259 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
9260 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
Martyn Capewella5112342020-06-05 18:20:11 +01009261 ldff1_32_unpacked_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh, UXTW);
9262 ldff1_32_unpacked_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009263
9264 Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
9265 Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
Martyn Capewella5112342020-06-05 18:20:11 +01009266 ldff1_32_unpacked_unscaled_offset_helper(kSRegSize, ldff1sw, ld1sw, UXTW);
9267 ldff1_32_unpacked_unscaled_offset_helper(kSRegSize, ldff1sw, ld1sw, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009268}
9269
9270static void sve_ldff1_scalar_plus_vector_64_scaled_offset(Test* config,
9271 uintptr_t data) {
Martyn Capewella5112342020-06-05 18:20:11 +01009272 auto ldff1_64_scaled_offset_helper = std::bind(&Ldff1Helper<Shift>,
TatWai Chong1af34f12020-06-01 20:54:06 -07009273 config,
9274 data,
9275 std::placeholders::_1,
9276 kDRegSize,
9277 CPURegister::kZRegister,
9278 std::placeholders::_2,
9279 std::placeholders::_3,
Martyn Capewella5112342020-06-05 18:20:11 +01009280 LSL,
TatWai Chong1af34f12020-06-01 20:54:06 -07009281 true);
9282
9283 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
9284 Ld1Macro ld1h = &MacroAssembler::Ld1h;
9285 ldff1_64_scaled_offset_helper(kHRegSize, ldff1h, ld1h);
9286
9287 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
9288 Ld1Macro ld1w = &MacroAssembler::Ld1w;
9289 ldff1_64_scaled_offset_helper(kSRegSize, ldff1w, ld1w);
9290
9291 Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
9292 Ld1Macro ld1d = &MacroAssembler::Ld1d;
9293 ldff1_64_scaled_offset_helper(kDRegSize, ldff1d, ld1d);
9294
9295 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
9296 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
9297 ldff1_64_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh);
9298
9299 Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
9300 Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
9301 ldff1_64_scaled_offset_helper(kSRegSize, ldff1sw, ld1sw);
9302}
9303
9304static void sve_ldff1_scalar_plus_vector_64_unscaled_offset(Test* config,
9305 uintptr_t data) {
Martyn Capewella5112342020-06-05 18:20:11 +01009306 auto ldff1_64_unscaled_offset_helper = std::bind(&Ldff1Helper<Shift>,
TatWai Chong1af34f12020-06-01 20:54:06 -07009307 config,
9308 data,
9309 std::placeholders::_1,
9310 kDRegSize,
9311 CPURegister::kZRegister,
9312 std::placeholders::_2,
9313 std::placeholders::_3,
Martyn Capewella5112342020-06-05 18:20:11 +01009314 NO_SHIFT,
TatWai Chong1af34f12020-06-01 20:54:06 -07009315 false);
9316
9317 Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
9318 Ld1Macro ld1b = &MacroAssembler::Ld1b;
9319 ldff1_64_unscaled_offset_helper(kBRegSize, ldff1b, ld1b);
9320
9321 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
9322 Ld1Macro ld1h = &MacroAssembler::Ld1h;
9323 ldff1_64_unscaled_offset_helper(kHRegSize, ldff1h, ld1h);
9324
9325 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
9326 Ld1Macro ld1w = &MacroAssembler::Ld1w;
9327 ldff1_64_unscaled_offset_helper(kSRegSize, ldff1w, ld1w);
9328
9329 Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
9330 Ld1Macro ld1d = &MacroAssembler::Ld1d;
9331 ldff1_64_unscaled_offset_helper(kDRegSize, ldff1d, ld1d);
9332
9333 Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
9334 Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
9335 ldff1_64_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb);
9336
9337 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
9338 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
9339 ldff1_64_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh);
9340
9341 Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
9342 Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
9343 ldff1_64_unscaled_offset_helper(kSRegSize, ldff1sw, ld1sw);
9344}
9345
TatWai Chong6537a9a2020-05-05 14:15:16 -07009346TEST_SVE(sve_ldff1_scalar_plus_vector) {
9347 size_t page_size = sysconf(_SC_PAGE_SIZE);
9348 VIXL_ASSERT(page_size > static_cast<size_t>(config->sve_vl_in_bytes()));
9349
9350 // Allocate two pages, then mprotect the second one to make it inaccessible.
9351 uintptr_t data = reinterpret_cast<uintptr_t>(mmap(NULL,
9352 page_size * 2,
9353 PROT_READ | PROT_WRITE,
9354 MAP_PRIVATE | MAP_ANONYMOUS,
9355 -1,
9356 0));
9357 mprotect(reinterpret_cast<void*>(data + page_size), page_size, PROT_NONE);
9358
9359 // Fill the accessible page with arbitrary data.
9360 for (size_t i = 0; i < page_size; i++) {
9361 // Reverse bits so we get a mixture of positive and negative values.
9362 uint8_t byte = ReverseBits(static_cast<uint8_t>(i));
9363 memcpy(reinterpret_cast<void*>(data + i), &byte, 1);
9364 }
9365
TatWai Chong1af34f12020-06-01 20:54:06 -07009366 sve_ldff1_scalar_plus_vector_32_scaled_offset(config, data);
9367 sve_ldff1_scalar_plus_vector_32_unscaled_offset(config, data);
9368 sve_ldff1_scalar_plus_vector_32_unpacked_scaled_offset(config, data);
9369 sve_ldff1_scalar_plus_vector_32_unpacked_unscaled_offset(config, data);
9370 sve_ldff1_scalar_plus_vector_64_scaled_offset(config, data);
9371 sve_ldff1_scalar_plus_vector_64_unscaled_offset(config, data);
TatWai Chong6537a9a2020-05-05 14:15:16 -07009372
9373 munmap(reinterpret_cast<void*>(data), page_size * 2);
9374}
9375
Martyn Capewell5f9b3802020-03-24 16:16:36 +00009376TEST_SVE(sve_ldnf1) {
9377 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
9378 CPUFeatures::kNEON,
9379 CPUFeatures::kFP);
9380 START();
9381
9382 size_t page_size = sysconf(_SC_PAGE_SIZE);
9383 VIXL_ASSERT(page_size > static_cast<size_t>(config->sve_vl_in_bytes()));
9384
9385 // Allocate two pages, fill them with data, then mprotect the second one to
9386 // make it inaccessible.
9387 uintptr_t data = reinterpret_cast<uintptr_t>(mmap(NULL,
9388 page_size * 2,
9389 PROT_READ | PROT_WRITE,
9390 MAP_PRIVATE | MAP_ANONYMOUS,
9391 -1,
9392 0));
9393
9394 // Fill the pages with arbitrary data.
9395 for (size_t i = 0; i < page_size; i++) {
9396 // Reverse bits so we get a mixture of positive and negative values.
9397 uint8_t byte = ReverseBits(static_cast<uint8_t>(i));
9398 memcpy(reinterpret_cast<void*>(data + i), &byte, 1);
9399 }
9400
9401 mprotect(reinterpret_cast<void*>(data + page_size), page_size, PROT_NONE);
9402
9403 __ Setffr();
9404 __ Ptrue(p0.VnB());
9405 __ Dup(z10.VnB(), 0);
9406
 9407 // Set x0 to the address of the last unprotected eight bytes.
9408 __ Mov(x0, data + page_size - (kQRegSizeInBytes / kBRegSizeInBytes) / 2);
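  // (kQRegSizeInBytes / kBRegSizeInBytes) / 2 is (16 / 1) / 2 = 8, so x0 points
  // eight bytes before the start of the protected page.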
9409
9410 // Load, non-faulting, a vector of bytes from x0. At most, eight bytes will be
9411 // loaded, the rest being in a protected page.
9412 __ Ldnf1b(z0.VnB(), p0.Zeroing(), SVEMemOperand(x0));
9413 __ Rdffr(p1.VnB());
9414 __ Setffr();
9415
9416 // Create references using the FFR value in p1 to zero the undefined lanes.
9417 __ Sel(z0.VnB(), p1, z0.VnB(), z10.VnB());
9418 __ Ld1b(z20.VnB(), p1.Zeroing(), SVEMemOperand(x0));
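  // z0 now has its FFR-inactive lanes zeroed by the Sel, and z20 is a normal
  // load predicated by the same FFR value, so the two can be compared directly.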
9419
9420 // Repeat for larger elements and different addresses, giving different FFR
9421 // results.
9422 __ Add(x1, x0, 1);
9423 __ Ldnf1h(z1.VnH(), p0.Zeroing(), SVEMemOperand(x1));
9424 __ Rdffr(p1.VnB());
9425 __ Setffr();
9426 __ Sel(z1.VnH(), p1, z1.VnH(), z10.VnH());
9427 __ Ld1h(z21.VnH(), p1.Zeroing(), SVEMemOperand(x1));
9428
9429 __ Add(x1, x0, 2);
9430 __ Ldnf1w(z2.VnS(), p0.Zeroing(), SVEMemOperand(x1));
9431 __ Rdffr(p1.VnB());
9432 __ Setffr();
9433 __ Sel(z2.VnS(), p1, z2.VnS(), z10.VnS());
9434 __ Ld1w(z22.VnS(), p1.Zeroing(), SVEMemOperand(x1));
9435
9436 __ Sub(x1, x0, 1);
9437 __ Ldnf1d(z3.VnD(), p0.Zeroing(), SVEMemOperand(x1));
9438 __ Rdffr(p1.VnB());
9439 __ Setffr();
9440 __ Sel(z3.VnD(), p1, z3.VnD(), z10.VnD());
9441 __ Ld1d(z23.VnD(), p1.Zeroing(), SVEMemOperand(x1));
9442
9443 // Load from previous VL-sized area of memory. All of this should be in the
9444 // accessible page.
9445 __ Ldnf1b(z4.VnB(), p0.Zeroing(), SVEMemOperand(x0, -1, SVE_MUL_VL));
9446 __ Rdffr(p1.VnB());
9447 __ Setffr();
9448 __ Sel(z4.VnB(), p1, z4.VnB(), z10.VnB());
9449 __ Ld1b(z24.VnB(), p1.Zeroing(), SVEMemOperand(x0, -1, SVE_MUL_VL));
9450
9451 // Repeat partial load for larger element size.
9452 __ Mov(x0, data + page_size - (kQRegSizeInBytes / kSRegSizeInBytes) / 2);
9453 __ Ldnf1b(z5.VnS(), p0.Zeroing(), SVEMemOperand(x0));
9454 __ Rdffr(p1.VnB());
9455 __ Setffr();
9456 __ Sel(z5.VnS(), p1, z5.VnS(), z10.VnS());
9457 __ Ld1b(z25.VnS(), p1.Zeroing(), SVEMemOperand(x0));
9458
9459 // Repeat for sign extension.
9460 __ Mov(x0, data + page_size - (kQRegSizeInBytes / kHRegSizeInBytes) / 2);
9461 __ Ldnf1sb(z6.VnH(), p0.Zeroing(), SVEMemOperand(x0));
9462 __ Rdffr(p1.VnB());
9463 __ Setffr();
9464 __ Sel(z6.VnH(), p1, z6.VnH(), z10.VnH());
9465 __ Ld1sb(z26.VnH(), p1.Zeroing(), SVEMemOperand(x0));
9466
9467 END();
9468
9469 if (CAN_RUN()) {
9470 RUN();
9471 ASSERT_EQUAL_SVE(z20, z0);
9472 ASSERT_EQUAL_SVE(z21, z1);
9473 ASSERT_EQUAL_SVE(z22, z2);
9474 ASSERT_EQUAL_SVE(z23, z3);
9475 ASSERT_EQUAL_SVE(z24, z4);
9476 ASSERT_EQUAL_SVE(z25, z5);
9477 ASSERT_EQUAL_SVE(z26, z6);
9478 }
9479
9480 munmap(reinterpret_cast<void*>(data), page_size * 2);
9481}
9482
TatWai Chongcd3f6c52020-06-14 00:42:39 -07009483// This test focuses on checking that the address modifiers are propagated and simulated correctly.
9484TEST_SVE(sve_ldff1_regression_test) {
9485 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
9486 START();
9487
9488 size_t page_size = sysconf(_SC_PAGE_SIZE);
9489 VIXL_ASSERT(page_size > static_cast<size_t>(config->sve_vl_in_bytes()));
9490
9491 uintptr_t data = reinterpret_cast<uintptr_t>(mmap(NULL,
9492 page_size * 2,
9493 PROT_READ | PROT_WRITE,
9494 MAP_PRIVATE | MAP_ANONYMOUS,
9495 -1,
9496 0));
9497 uintptr_t middle = data + page_size;
9498 // Fill the accessible page with arbitrary data.
9499 for (size_t i = 0; i < page_size; i++) {
9500 // Reverse bits so we get a mixture of positive and negative values.
9501 uint8_t byte = ReverseBits(static_cast<uint8_t>(i));
9502 memcpy(reinterpret_cast<void*>(middle + i), &byte, 1);
 9503 // Make each byte slightly different from its forward counterpart, and copy
 9504 // the bytes in the reverse direction; this makes it convenient to verify
 9505 // the loads at negative indexes.
9506 byte += 1;
9507 memcpy(reinterpret_cast<void*>(middle - i), &byte, 1);
9508 }
9509
9510 PRegister all = p6;
9511 __ Ptrue(all.VnB());
9512
9513 __ Mov(x0, middle);
9514 __ Index(z31.VnS(), 0, 3);
9515 __ Neg(z30.VnS(), z31.VnS());
9516
9517 __ Setffr();
9518
9519 // Scalar plus vector 32 unscaled offset
9520 __ Ldff1b(z1.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
9521 __ Ldff1h(z2.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW));
9522 __ Ldff1w(z3.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
9523 __ Ldff1sb(z4.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW));
9524 __ Ldff1sh(z5.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
9525
9526 // Scalar plus vector 32 scaled offset
9527 __ Ldff1h(z6.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW, 1));
9528 __ Ldff1w(z7.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW, 2));
9529 __ Ldff1sh(z8.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW, 1));
9530
9531 __ Index(z31.VnD(), 0, 3);
9532 __ Neg(z30.VnD(), z31.VnD());
9533
 9534 // Ensure that only the low 32 bits are used for the test, using positive index
 9535 // values. This also checks that the indexes are treated as unsigned in `uxtw` form.
9536 __ Mov(x3, 0x8000000080000000);
9537 __ Dup(z28.VnD(), x3);
9538 __ Sub(x2, x0, 0x80000000);
9539 __ Add(z29.VnD(), z31.VnD(), z28.VnD());
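  // Each D lane of z29 is 0x8000000080000000 + 3 * i. With `uxtw`, only the low
  // 32 bits (0x80000000 + 3 * i) are used, and since x2 is `middle - 0x80000000`
  // the effective addresses are `middle + 3 * i`.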
9540
9541 // Scalar plus vector 32 unpacked unscaled offset
9542 __ Ldff1b(z9.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9543 __ Ldff1h(z10.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW));
9544 __ Ldff1w(z11.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9545 __ Ldff1sb(z12.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9546 __ Ldff1sh(z13.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW));
9547 __ Ldff1sw(z14.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9548
9549 // Scalar plus vector 32 unpacked scaled offset
9550 __ Ldff1h(z15.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 1));
9551 __ Ldff1w(z16.VnD(), all.Zeroing(), SVEMemOperand(x0, z31.VnD(), UXTW, 2));
9552 __ Ldff1d(z17.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 3));
9553 __ Ldff1sh(z18.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 1));
9554 __ Ldff1sw(z19.VnD(), all.Zeroing(), SVEMemOperand(x0, z31.VnD(), UXTW, 2));
9555
9556 __ Sub(x0, x0, x3);
 9557 // Note that `0x8000000080000000` has been added to the positive indexes and
 9558 // subtracted from the base in x0; the wrong address will be accessed if the index is treated as signed (negative).
9559
9560 // Scalar plus vector 64 unscaled offset
9561 __ Ldff1b(z20.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9562 __ Ldff1h(z21.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9563 __ Ldff1w(z22.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9564 __ Ldff1sh(z23.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9565 __ Ldff1sw(z24.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9566
9567 // Scalar plus vector 64 scaled offset
9568 __ Lsr(z29.VnD(), z28.VnD(), 1); // Shift right to 0x4000000040000000
9569 __ Add(z30.VnD(), z31.VnD(), z29.VnD());
9570 __ Ldff1h(z25.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 1));
9571 __ Ldff1sh(z26.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 1));
9572
9573 __ Lsr(z29.VnD(), z29.VnD(), 1); // Shift right to 0x2000000020000000
9574 __ Add(z30.VnD(), z31.VnD(), z29.VnD());
9575 __ Ldff1w(z27.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 2));
9576 __ Ldff1sw(z28.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 2));
9577
9578 __ Lsr(z29.VnD(), z29.VnD(), 1); // Shift right to 0x1000000010000000
9579 __ Add(z30.VnD(), z31.VnD(), z29.VnD());
9580 __ Ldff1d(z29.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 3));
9581
9582 __ Rdffr(p1.VnB());
9583 __ Cntp(x10, all, p1.VnB());
9584
9585 END();
9586
9587 if (CAN_RUN()) {
9588 RUN();
9589
9590 int64_t loaded_data_in_bytes = core.xreg(x10.GetCode());
9591 // Only check 128 bits in this test.
9592 if (loaded_data_in_bytes < kQRegSizeInBytes) {
 9593 // Report a warning if the fault-tolerant loads faulted before all the
 9594 // expected loads had been performed.
9595 printf(
9596 "WARNING: Fault-tolerant loads detected faults before the "
9597 "expected loads completed.\n");
9598 return;
9599 }
9600
9601 // Scalar plus vector 32 unscaled offset
9602 uint32_t expected_z1[] = {0x00000090, 0x00000060, 0x000000c0, 0x00000001};
9603 uint32_t expected_z2[] = {0x00001191, 0x0000a161, 0x000041c1, 0x00008001};
9604 uint32_t expected_z3[] = {0x30d05090, 0x9010e060, 0x60a020c0, 0xc0408001};
9605 uint32_t expected_z4[] = {0xffffff91, 0x00000061, 0xffffffc1, 0x00000001};
9606 uint32_t expected_z5[] = {0x00005090, 0xffffe060, 0x000020c0, 0xffff8001};
9607
9608 ASSERT_EQUAL_SVE(expected_z1, z1.VnS());
9609 ASSERT_EQUAL_SVE(expected_z2, z2.VnS());
9610 ASSERT_EQUAL_SVE(expected_z3, z3.VnS());
9611 ASSERT_EQUAL_SVE(expected_z4, z4.VnS());
9612 ASSERT_EQUAL_SVE(expected_z5, z5.VnS());
9613
9614 // Scalar plus vector 32 scaled offset
9615 uint32_t expected_z6[] = {0x0000c848, 0x0000b030, 0x0000e060, 0x00008001};
9616 uint32_t expected_z7[] = {0xe464a424, 0xd8589818, 0xf070b030, 0xc0408001};
9617 uint32_t expected_z8[] = {0xffff8949, 0xffffd131, 0xffffa161, 0xffff8001};
9618
9619 ASSERT_EQUAL_SVE(expected_z6, z6.VnS());
9620 ASSERT_EQUAL_SVE(expected_z7, z7.VnS());
9621 ASSERT_EQUAL_SVE(expected_z8, z8.VnS());
9622
9623 // Scalar plus vector 32 unpacked unscaled offset
9624 uint64_t expected_z9[] = {0x00000000000000c0, 0x0000000000000001};
9625 uint64_t expected_z10[] = {0x00000000000041c1, 0x0000000000008001};
9626 uint64_t expected_z11[] = {0x0000000060a020c0, 0x00000000c0408001};
9627 uint64_t expected_z12[] = {0xffffffffffffffc0, 0x0000000000000001};
9628 uint64_t expected_z13[] = {0x00000000000041c1, 0xffffffffffff8001};
9629 uint64_t expected_z14[] = {0x0000000060a020c0, 0xffffffffc0408001};
9630
9631 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
9632 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
9633 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
9634 ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
9635 ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
9636 ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
9637
9638 // Scalar plus vector 32 unpacked scaled offset
9639 uint64_t expected_z15[] = {0x000000000000a161, 0x0000000000008001};
9640 uint64_t expected_z16[] = {0x00000000f070b030, 0x00000000c0408001};
9641 uint64_t expected_z17[] = {0x8949c929a969e919, 0xe060a020c0408001};
9642 uint64_t expected_z18[] = {0xffffffffffffa161, 0xffffffffffff8001};
9643 uint64_t expected_z19[] = {0xfffffffff070b030, 0xffffffffc0408001};
9644
9645 ASSERT_EQUAL_SVE(expected_z15, z15.VnD());
9646 ASSERT_EQUAL_SVE(expected_z16, z16.VnD());
9647 ASSERT_EQUAL_SVE(expected_z17, z17.VnD());
9648 ASSERT_EQUAL_SVE(expected_z18, z18.VnD());
9649 ASSERT_EQUAL_SVE(expected_z19, z19.VnD());
9650
9651 // Scalar plus vector 64 unscaled offset
9652 uint64_t expected_z20[] = {0x00000000000000c0, 0x0000000000000001};
9653 uint64_t expected_z21[] = {0x00000000000020c0, 0x0000000000008001};
9654 uint64_t expected_z22[] = {0x0000000060a020c0, 0x00000000c0408001};
9655 uint64_t expected_z23[] = {0x00000000000020c0, 0xffffffffffff8001};
9656 uint64_t expected_z24[] = {0x0000000060a020c0, 0xffffffffc0408001};
9657
9658 ASSERT_EQUAL_SVE(expected_z20, z20.VnD());
9659 ASSERT_EQUAL_SVE(expected_z21, z21.VnD());
9660 ASSERT_EQUAL_SVE(expected_z22, z22.VnD());
9661 ASSERT_EQUAL_SVE(expected_z23, z23.VnD());
9662 ASSERT_EQUAL_SVE(expected_z24, z24.VnD());
9663
9664 uint64_t expected_z25[] = {0x000000000000e060, 0x0000000000008001};
9665 uint64_t expected_z26[] = {0xffffffffffffe060, 0xffffffffffff8001};
9666 uint64_t expected_z27[] = {0x00000000f070b030, 0x00000000c0408001};
9667 uint64_t expected_z28[] = {0xfffffffff070b030, 0xffffffffc0408001};
9668 uint64_t expected_z29[] = {0xf878b838d8589818, 0xe060a020c0408001};
9669
9670 // Scalar plus vector 64 scaled offset
9671 ASSERT_EQUAL_SVE(expected_z25, z25.VnD());
9672 ASSERT_EQUAL_SVE(expected_z26, z26.VnD());
9673 ASSERT_EQUAL_SVE(expected_z27, z27.VnD());
9674 ASSERT_EQUAL_SVE(expected_z28, z28.VnD());
9675 ASSERT_EQUAL_SVE(expected_z29, z29.VnD());
9676 }
9677}
9678
Martyn Capewella5112342020-06-05 18:20:11 +01009679// This test focuses on checking that the address modifiers are propagated and simulated correctly.
9680TEST_SVE(sve_ld1_regression_test) {
9681 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
9682 START();
9683
9684 size_t page_size = sysconf(_SC_PAGE_SIZE);
9685 VIXL_ASSERT(page_size > static_cast<size_t>(config->sve_vl_in_bytes()));
9686
9687 uintptr_t data = reinterpret_cast<uintptr_t>(mmap(NULL,
9688 page_size * 2,
9689 PROT_READ | PROT_WRITE,
9690 MAP_PRIVATE | MAP_ANONYMOUS,
9691 -1,
9692 0));
9693 uintptr_t middle = data + page_size;
9694 // Fill the accessible page with arbitrary data.
9695 for (size_t i = 0; i < page_size; i++) {
9696 // Reverse bits so we get a mixture of positive and negative values.
9697 uint8_t byte = ReverseBits(static_cast<uint8_t>(i));
9698 memcpy(reinterpret_cast<void*>(middle + i), &byte, 1);
 9699 // Make each byte slightly different from its forward counterpart, and copy
 9700 // the bytes in the reverse direction; this makes it convenient to verify
 9701 // the loads at negative indexes.
9702 byte += 1;
9703 memcpy(reinterpret_cast<void*>(middle - i), &byte, 1);
9704 }
9705
9706 PRegister all = p6;
9707 __ Ptrue(all.VnB());
9708
9709 __ Mov(x0, middle);
9710 __ Index(z31.VnS(), 0, 3);
9711 __ Neg(z30.VnS(), z31.VnS());
9712
9713 // Scalar plus vector 32 unscaled offset
9714 __ Ld1b(z1.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
9715 __ Ld1h(z2.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW));
9716 __ Ld1w(z3.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
9717 __ Ld1sb(z4.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW));
9718 __ Ld1sh(z5.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
9719
9720 // Scalar plus vector 32 scaled offset
9721 __ Ld1h(z6.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW, 1));
9722 __ Ld1w(z7.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW, 2));
9723 __ Ld1sh(z8.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW, 1));
9724
9725 __ Index(z31.VnD(), 0, 3);
9726 __ Neg(z30.VnD(), z31.VnD());
9727
 9728 // Ensure that only the low 32 bits are used for the test, using positive index
 9729 // values. This also checks that the indexes are treated as unsigned in `uxtw` form.
9730 __ Mov(x3, 0x8000000080000000);
9731 __ Dup(z28.VnD(), x3);
9732 __ Sub(x2, x0, 0x80000000);
9733 __ Add(z29.VnD(), z31.VnD(), z28.VnD());
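  // As in the first-fault regression test above: the low 32 bits of each z29
  // lane are 0x80000000 + 3 * i and x2 is `middle - 0x80000000`, so the `uxtw`
  // forms below address `middle + 3 * i`.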
9734
9735 // Scalar plus vector 32 unpacked unscaled offset
9736 __ Ld1b(z9.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9737 __ Ld1h(z10.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW));
9738 __ Ld1w(z11.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9739 __ Ld1sb(z12.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9740 __ Ld1sh(z13.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW));
9741 __ Ld1sw(z14.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9742
9743 // Scalar plus vector 32 unpacked scaled offset
9744 __ Ld1h(z15.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 1));
9745 __ Ld1w(z16.VnD(), all.Zeroing(), SVEMemOperand(x0, z31.VnD(), UXTW, 2));
9746 __ Ld1d(z17.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 3));
9747 __ Ld1sh(z18.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 1));
9748 __ Ld1sw(z19.VnD(), all.Zeroing(), SVEMemOperand(x0, z31.VnD(), UXTW, 2));
9749
9750 __ Sub(x0, x0, x3);
 9751 // Note that `0x8000000080000000` has been added to the positive indexes and
 9752 // subtracted from the base in x0; the wrong address will be accessed if the index is treated as signed (negative).
9753
9754 // Scalar plus vector 64 unscaled offset
9755 __ Ld1b(z20.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9756 __ Ld1h(z21.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9757 __ Ld1w(z22.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9758 __ Ld1sh(z23.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9759 __ Ld1sw(z24.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9760
9761 // Scalar plus vector 64 scaled offset
9762 __ Lsr(z29.VnD(), z28.VnD(), 1); // Shift right to 0x4000000040000000
9763 __ Add(z30.VnD(), z31.VnD(), z29.VnD());
9764 __ Ld1h(z25.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 1));
9765 __ Ld1sh(z26.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 1));
9766
9767 __ Lsr(z29.VnD(), z29.VnD(), 1); // Shift right to 0x2000000020000000
9768 __ Add(z30.VnD(), z31.VnD(), z29.VnD());
9769 __ Ld1w(z27.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 2));
9770 __ Ld1sw(z28.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 2));
9771
9772 __ Lsr(z29.VnD(), z29.VnD(), 1); // Shift right to 0x1000000010000000
9773 __ Add(z30.VnD(), z31.VnD(), z29.VnD());
9774 __ Ld1d(z29.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 3));
9775
9776 END();
9777
9778 if (CAN_RUN()) {
9779 RUN();
9780
9781 // Scalar plus vector 32 unscaled offset
9782 uint32_t expected_z1[] = {0x00000090, 0x00000060, 0x000000c0, 0x00000001};
9783 uint32_t expected_z2[] = {0x00001191, 0x0000a161, 0x000041c1, 0x00008001};
9784 uint32_t expected_z3[] = {0x30d05090, 0x9010e060, 0x60a020c0, 0xc0408001};
9785 uint32_t expected_z4[] = {0xffffff91, 0x00000061, 0xffffffc1, 0x00000001};
9786 uint32_t expected_z5[] = {0x00005090, 0xffffe060, 0x000020c0, 0xffff8001};
9787
9788 ASSERT_EQUAL_SVE(expected_z1, z1.VnS());
9789 ASSERT_EQUAL_SVE(expected_z2, z2.VnS());
9790 ASSERT_EQUAL_SVE(expected_z3, z3.VnS());
9791 ASSERT_EQUAL_SVE(expected_z4, z4.VnS());
9792 ASSERT_EQUAL_SVE(expected_z5, z5.VnS());
9793
9794 // Scalar plus vector 32 scaled offset
9795 uint32_t expected_z6[] = {0x0000c848, 0x0000b030, 0x0000e060, 0x00008001};
9796 uint32_t expected_z7[] = {0xe464a424, 0xd8589818, 0xf070b030, 0xc0408001};
9797 uint32_t expected_z8[] = {0xffff8949, 0xffffd131, 0xffffa161, 0xffff8001};
9798
9799 ASSERT_EQUAL_SVE(expected_z6, z6.VnS());
9800 ASSERT_EQUAL_SVE(expected_z7, z7.VnS());
9801 ASSERT_EQUAL_SVE(expected_z8, z8.VnS());
9802
9803 // Scalar plus vector 32 unpacked unscaled offset
9804 uint64_t expected_z9[] = {0x00000000000000c0, 0x0000000000000001};
9805 uint64_t expected_z10[] = {0x00000000000041c1, 0x0000000000008001};
9806 uint64_t expected_z11[] = {0x0000000060a020c0, 0x00000000c0408001};
9807 uint64_t expected_z12[] = {0xffffffffffffffc0, 0x0000000000000001};
9808 uint64_t expected_z13[] = {0x00000000000041c1, 0xffffffffffff8001};
9809 uint64_t expected_z14[] = {0x0000000060a020c0, 0xffffffffc0408001};
9810
9811 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
9812 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
9813 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
9814 ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
9815 ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
9816 ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
9817
9818 // Scalar plus vector 32 unpacked scaled offset
9819 uint64_t expected_z15[] = {0x000000000000a161, 0x0000000000008001};
9820 uint64_t expected_z16[] = {0x00000000f070b030, 0x00000000c0408001};
9821 uint64_t expected_z17[] = {0x8949c929a969e919, 0xe060a020c0408001};
9822 uint64_t expected_z18[] = {0xffffffffffffa161, 0xffffffffffff8001};
9823 uint64_t expected_z19[] = {0xfffffffff070b030, 0xffffffffc0408001};
9824
9825 ASSERT_EQUAL_SVE(expected_z15, z15.VnD());
9826 ASSERT_EQUAL_SVE(expected_z16, z16.VnD());
9827 ASSERT_EQUAL_SVE(expected_z17, z17.VnD());
9828 ASSERT_EQUAL_SVE(expected_z18, z18.VnD());
9829 ASSERT_EQUAL_SVE(expected_z19, z19.VnD());
9830
9831 // Scalar plus vector 64 unscaled offset
9832 uint64_t expected_z20[] = {0x00000000000000c0, 0x0000000000000001};
9833 uint64_t expected_z21[] = {0x00000000000020c0, 0x0000000000008001};
9834 uint64_t expected_z22[] = {0x0000000060a020c0, 0x00000000c0408001};
9835 uint64_t expected_z23[] = {0x00000000000020c0, 0xffffffffffff8001};
9836 uint64_t expected_z24[] = {0x0000000060a020c0, 0xffffffffc0408001};
9837
9838 ASSERT_EQUAL_SVE(expected_z20, z20.VnD());
9839 ASSERT_EQUAL_SVE(expected_z21, z21.VnD());
9840 ASSERT_EQUAL_SVE(expected_z22, z22.VnD());
9841 ASSERT_EQUAL_SVE(expected_z23, z23.VnD());
9842 ASSERT_EQUAL_SVE(expected_z24, z24.VnD());
9843
9844 uint64_t expected_z25[] = {0x000000000000e060, 0x0000000000008001};
9845 uint64_t expected_z26[] = {0xffffffffffffe060, 0xffffffffffff8001};
9846 uint64_t expected_z27[] = {0x00000000f070b030, 0x00000000c0408001};
9847 uint64_t expected_z28[] = {0xfffffffff070b030, 0xffffffffc0408001};
9848 uint64_t expected_z29[] = {0xf878b838d8589818, 0xe060a020c0408001};
9849
9850 // Scalar plus vector 64 scaled offset
9851 ASSERT_EQUAL_SVE(expected_z25, z25.VnD());
9852 ASSERT_EQUAL_SVE(expected_z26, z26.VnD());
9853 ASSERT_EQUAL_SVE(expected_z27, z27.VnD());
9854 ASSERT_EQUAL_SVE(expected_z28, z28.VnD());
9855 ASSERT_EQUAL_SVE(expected_z29, z29.VnD());
9856 }
9857}
9858
TatWai Chong113d9192020-05-19 01:02:36 -07009859// Test gather loads by comparing them with the result of a set of equivalent
9860// scalar loads.
Martyn Capewella5112342020-06-05 18:20:11 +01009861template <typename T>
TatWai Chong113d9192020-05-19 01:02:36 -07009862static void GatherLoadScalarPlusVectorHelper(Test* config,
9863 unsigned msize_in_bits,
9864 unsigned esize_in_bits,
9865 Ld1Macro ld1,
TatWai Chong6537a9a2020-05-05 14:15:16 -07009866 Ld1Macro ldff1,
Martyn Capewella5112342020-06-05 18:20:11 +01009867 T mod,
TatWai Chong113d9192020-05-19 01:02:36 -07009868 bool is_signed,
9869 bool is_scaled) {
9870 // SVE supports 32- and 64-bit addressing for gather loads.
9871 VIXL_ASSERT((esize_in_bits == kSRegSize) || (esize_in_bits == kDRegSize));
9872 static const unsigned kMaxLaneCount = kZRegMaxSize / kSRegSize;
9873
9874 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
9875 START();
9876
9877 unsigned msize_in_bytes = msize_in_bits / kBitsPerByte;
9878 int vl = config->sve_vl_in_bytes();
9879
9880 uint64_t addresses[kMaxLaneCount];
9881 uint64_t offsets[kMaxLaneCount];
9882 uint64_t max_address = 0;
9883 uint64_t buffer_size = vl * 64;
9884 uint64_t data = reinterpret_cast<uintptr_t>(malloc(buffer_size));
 9885 // Fill the buffer with arbitrary data and, at the same time, generate random
 9886 // addresses and offsets into the buffer in the arrays passed as arguments.
9887 BufferFillingHelper(data,
9888 buffer_size,
9889 msize_in_bytes,
9890 kMaxLaneCount,
9891 offsets,
9892 addresses,
9893 &max_address);
9894
9895 ZRegister zn = z0.WithLaneSize(esize_in_bits);
9896 ZRegister zt_ref = z1.WithLaneSize(esize_in_bits);
Martyn Capewella5112342020-06-05 18:20:11 +01009897 ZRegister zt = z2.WithLaneSize(esize_in_bits);
9898 ZRegister zt_ff = z3.WithLaneSize(esize_in_bits);
9899 PRegisterWithLaneSize pg_ff = p1.WithLaneSize(esize_in_bits);
9900 PRegisterWithLaneSize pg_diff = p2.WithLaneSize(esize_in_bits);
TatWai Chong113d9192020-05-19 01:02:36 -07009901
9902 int shift = 0;
9903 if (is_scaled) {
9904 shift = std::log2(msize_in_bytes);
9905 for (unsigned i = 0; i < kMaxLaneCount; i++) {
 9906 // Ensure the offsets are multiples of the scale factor of the
 9907 // operation.
9908 offsets[i] = (offsets[i] >> shift) << shift;
9909 addresses[i] = data + offsets[i];
9910 }
9911 }
9912
9913 PRegister all = p6;
9914 __ Ptrue(all.WithLaneSize(esize_in_bits));
9915
9916 PRegisterZ pg = p0.Zeroing();
9917 Initialise(&masm,
9918 pg,
9919 0x9abcdef012345678,
9920 0xabcdef0123456789,
9921 0xf4f3f1f0fefdfcfa,
9922 0xf9f8f6f5f3f2f1ff);
9923
9924 __ Mov(x0, data);
9925
9926 // Generate a reference result for scalar-plus-scalar form using scalar loads.
9927 ScalarLoadHelper(&masm,
9928 vl,
9929 addresses,
9930 zt_ref,
9931 pg,
9932 esize_in_bits,
9933 msize_in_bits,
9934 is_signed);
9935
9936 InsrHelper(&masm, zn, offsets);
9937 if (is_scaled) {
 9938 // Scale down the offsets when testing a scaled-offset operation.
9939 __ Lsr(zn, zn, shift);
9940 }
9941
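  // Perform the gather load under test: scalar base in x0 plus the vector of
  // (possibly scaled-down) offsets in zn, using the extend/shift modifier.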
Martyn Capewella5112342020-06-05 18:20:11 +01009942 (masm.*ld1)(zt, pg, SVEMemOperand(x0, zn, mod, shift));
TatWai Chong113d9192020-05-19 01:02:36 -07009943
TatWai Chong6537a9a2020-05-05 14:15:16 -07009944 Register ffr_check_count = x17;
9945 __ Mov(ffr_check_count, 0);
9946
TatWai Chong6537a9a2020-05-05 14:15:16 -07009947 // Check that the gather load returns the correct data for each of the
 9948 // addresses. The first-fault behavior itself is exercised in `Ldff1Helper`.
9949 __ Setffr();
Martyn Capewella5112342020-06-05 18:20:11 +01009950 (masm.*ldff1)(zt_ff, pg, SVEMemOperand(x0, zn, mod, shift));
9951
 9952 // Compare the two result vectors, and count in `ffr_check_count` the lanes where
 9953 // the FFR "loaded" status disagrees with whether the value matches the reference.
9954 __ Rdffrs(pg_ff.VnB(), all.Zeroing());
9955 __ Cmpeq(pg_diff, all.Zeroing(), zt_ref, zt_ff);
9956 __ Eor(pg_diff.VnB(), all.Zeroing(), pg_diff.VnB(), pg_ff.VnB());
9957 __ Incp(ffr_check_count, pg_diff);
TatWai Chong6537a9a2020-05-05 14:15:16 -07009958
TatWai Chong113d9192020-05-19 01:02:36 -07009959 END();
9960
9961 if (CAN_RUN()) {
9962 RUN();
9963
Martyn Capewella5112342020-06-05 18:20:11 +01009964 ASSERT_EQUAL_SVE(zt_ref, zt);
TatWai Chong6537a9a2020-05-05 14:15:16 -07009965 ASSERT_EQUAL_64(0, ffr_check_count);
TatWai Chong113d9192020-05-19 01:02:36 -07009966 }
9967
9968 free(reinterpret_cast<void*>(data));
9969}
9970
Jacob Bramleydcdbd752020-01-20 11:47:36 +00009971// Test gather loads by comparing them with the result of a set of equivalent
9972// scalar loads.
9973template <typename F>
TatWai Chong113d9192020-05-19 01:02:36 -07009974static void GatherLoadScalarPlusScalarOrImmHelper(Test* config,
9975 unsigned msize_in_bits,
9976 unsigned esize_in_bits,
9977 F sve_ld1,
9978 bool is_signed) {
Jacob Bramleydcdbd752020-01-20 11:47:36 +00009979 // SVE supports 32- and 64-bit addressing for gather loads.
9980 VIXL_ASSERT((esize_in_bits == kSRegSize) || (esize_in_bits == kDRegSize));
9981 static const unsigned kMaxLaneCount = kZRegMaxSize / kSRegSize;
9982
9983 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
9984 START();
9985
9986 unsigned msize_in_bytes = msize_in_bits / kBitsPerByte;
Jacob Bramleydcdbd752020-01-20 11:47:36 +00009987 int vl = config->sve_vl_in_bytes();
9988
Jacob Bramleydcdbd752020-01-20 11:47:36 +00009989 uint64_t addresses[kMaxLaneCount];
9990 uint64_t offsets[kMaxLaneCount];
9991 uint64_t max_address = 0;
TatWai Chong85e15102020-05-04 21:00:40 -07009992 uint64_t buffer_size = vl * 64;
9993 uint64_t data = reinterpret_cast<uintptr_t>(malloc(buffer_size));
9994 BufferFillingHelper(data,
9995 buffer_size,
9996 msize_in_bytes,
9997 kMaxLaneCount,
9998 offsets,
9999 addresses,
10000 &max_address);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010001
10002 // Maximised offsets, to ensure that the address calculation is modulo-2^64,
10003 // and that the vector addresses are not sign-extended.
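 // For example, for the lane holding `max_address` the maxed offset is
 // `uint_e_max` (all ones), and adding `maxed_offsets_imm` back reproduces
 // `max_address` via the modulo-2^64 address calculation.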
10004 uint64_t uint_e_max = (esize_in_bits == kDRegSize) ? UINT64_MAX : UINT32_MAX;
10005 uint64_t maxed_offsets[kMaxLaneCount];
10006 uint64_t maxed_offsets_imm = max_address - uint_e_max;
10007 for (unsigned i = 0; i < kMaxLaneCount; i++) {
10008 maxed_offsets[i] = addresses[i] - maxed_offsets_imm;
10009 }
10010
10011 ZRegister zn = z0.WithLaneSize(esize_in_bits);
10012 ZRegister zt_addresses = z1.WithLaneSize(esize_in_bits);
10013 ZRegister zt_offsets = z2.WithLaneSize(esize_in_bits);
10014 ZRegister zt_maxed = z3.WithLaneSize(esize_in_bits);
10015 ZRegister zt_ref = z4.WithLaneSize(esize_in_bits);
10016
10017 PRegisterZ pg = p0.Zeroing();
10018 Initialise(&masm,
10019 pg,
10020 0x9abcdef012345678,
10021 0xabcdef0123456789,
10022 0xf4f3f1f0fefdfcfa,
10023 0xf9f8f6f5f3f2f0ff);
10024
10025 // Execute each load.
10026
10027 if (esize_in_bits == kDRegSize) {
10028 // Only test `addresses` if we can use 64-bit pointers. InsrHelper will fail
10029 // if any value won't fit in a lane of zn.
10030 InsrHelper(&masm, zn, addresses);
10031 (masm.*sve_ld1)(zt_addresses, pg, SVEMemOperand(zn));
10032 }
10033
10034 InsrHelper(&masm, zn, offsets);
10035 (masm.*sve_ld1)(zt_offsets, pg, SVEMemOperand(zn, data));
10036
10037 InsrHelper(&masm, zn, maxed_offsets);
10038 (masm.*sve_ld1)(zt_maxed, pg, SVEMemOperand(zn, maxed_offsets_imm));
10039
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010040 // Generate a reference result using scalar loads.
TatWai Chong85e15102020-05-04 21:00:40 -070010041 ScalarLoadHelper(&masm,
10042 vl,
10043 addresses,
10044 zt_ref,
10045 pg,
10046 esize_in_bits,
10047 msize_in_bits,
10048 is_signed);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010049
10050 END();
10051
10052 if (CAN_RUN()) {
10053 RUN();
10054
10055 if (esize_in_bits == kDRegSize) {
10056 ASSERT_EQUAL_SVE(zt_ref, zt_addresses);
10057 }
10058 ASSERT_EQUAL_SVE(zt_ref, zt_offsets);
10059 ASSERT_EQUAL_SVE(zt_ref, zt_maxed);
10060 }
10061
10062 free(reinterpret_cast<void*>(data));
10063}
10064
10065TEST_SVE(sve_ld1b_64bit_vector_plus_immediate) {
TatWai Chong113d9192020-05-19 01:02:36 -070010066 GatherLoadScalarPlusScalarOrImmHelper(config,
10067 kBRegSize,
10068 kDRegSize,
10069 &MacroAssembler::Ld1b,
10070 false);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010071}
10072
10073TEST_SVE(sve_ld1h_64bit_vector_plus_immediate) {
TatWai Chong113d9192020-05-19 01:02:36 -070010074 GatherLoadScalarPlusScalarOrImmHelper(config,
10075 kHRegSize,
10076 kDRegSize,
10077 &MacroAssembler::Ld1h,
10078 false);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010079}
10080
10081TEST_SVE(sve_ld1w_64bit_vector_plus_immediate) {
TatWai Chong113d9192020-05-19 01:02:36 -070010082 GatherLoadScalarPlusScalarOrImmHelper(config,
10083 kSRegSize,
10084 kDRegSize,
10085 &MacroAssembler::Ld1w,
10086 false);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010087}
10088
10089TEST_SVE(sve_ld1d_64bit_vector_plus_immediate) {
TatWai Chong113d9192020-05-19 01:02:36 -070010090 GatherLoadScalarPlusScalarOrImmHelper(config,
10091 kDRegSize,
10092 kDRegSize,
10093 &MacroAssembler::Ld1d,
10094 false);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010095}
10096
10097TEST_SVE(sve_ld1sb_64bit_vector_plus_immediate) {
TatWai Chong113d9192020-05-19 01:02:36 -070010098 GatherLoadScalarPlusScalarOrImmHelper(config,
10099 kBRegSize,
10100 kDRegSize,
10101 &MacroAssembler::Ld1sb,
10102 true);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010103}
10104
10105TEST_SVE(sve_ld1sh_64bit_vector_plus_immediate) {
TatWai Chong113d9192020-05-19 01:02:36 -070010106 GatherLoadScalarPlusScalarOrImmHelper(config,
10107 kHRegSize,
10108 kDRegSize,
10109 &MacroAssembler::Ld1sh,
10110 true);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010111}
10112
10113TEST_SVE(sve_ld1sw_64bit_vector_plus_immediate) {
TatWai Chong113d9192020-05-19 01:02:36 -070010114 GatherLoadScalarPlusScalarOrImmHelper(config,
10115 kSRegSize,
10116 kDRegSize,
10117 &MacroAssembler::Ld1sw,
10118 true);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010119}
10120
10121TEST_SVE(sve_ld1b_32bit_vector_plus_immediate) {
TatWai Chong113d9192020-05-19 01:02:36 -070010122 GatherLoadScalarPlusScalarOrImmHelper(config,
10123 kBRegSize,
10124 kSRegSize,
10125 &MacroAssembler::Ld1b,
10126 false);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010127}
10128
10129TEST_SVE(sve_ld1h_32bit_vector_plus_immediate) {
TatWai Chong113d9192020-05-19 01:02:36 -070010130 GatherLoadScalarPlusScalarOrImmHelper(config,
10131 kHRegSize,
10132 kSRegSize,
10133 &MacroAssembler::Ld1h,
10134 false);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010135}
10136
10137TEST_SVE(sve_ld1w_32bit_vector_plus_immediate) {
TatWai Chong113d9192020-05-19 01:02:36 -070010138 GatherLoadScalarPlusScalarOrImmHelper(config,
10139 kSRegSize,
10140 kSRegSize,
10141 &MacroAssembler::Ld1w,
10142 false);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010143}
10144
10145TEST_SVE(sve_ld1sb_32bit_vector_plus_immediate) {
TatWai Chong113d9192020-05-19 01:02:36 -070010146 GatherLoadScalarPlusScalarOrImmHelper(config,
10147 kBRegSize,
10148 kSRegSize,
10149 &MacroAssembler::Ld1sb,
10150 true);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010151}
10152
10153TEST_SVE(sve_ld1sh_32bit_vector_plus_immediate) {
TatWai Chong113d9192020-05-19 01:02:36 -070010154 GatherLoadScalarPlusScalarOrImmHelper(config,
10155 kHRegSize,
10156 kSRegSize,
10157 &MacroAssembler::Ld1sh,
10158 true);
10159}
10160
Martyn Capewella5112342020-06-05 18:20:11 +010010161TEST_SVE(sve_ld1_scalar_plus_vector_32_scaled_offset) {
10162 auto ld1_32_scaled_offset_helper =
10163 std::bind(&GatherLoadScalarPlusVectorHelper<Extend>,
10164 config,
10165 std::placeholders::_1,
10166 kSRegSize,
10167 std::placeholders::_2,
10168 std::placeholders::_3,
10169 std::placeholders::_4,
10170 std::placeholders::_5,
10171 true);
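  // A reading aid for the binding above (a sketch inferred from the calls
  // below, not authoritative): the fixed arguments are `config`, kSRegSize
  // (presumably the lane size) and `true` (presumably the scaled-offset
  // flag), while each call fills in the memory access size, the ld1 and
  // ldff1 macros, the extend mode and the signedness.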
10172
10173 Ld1Macro ld1h = &MacroAssembler::Ld1h;
10174 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
10175 ld1_32_scaled_offset_helper(kHRegSize, ld1h, ldff1h, UXTW, false);
10176 ld1_32_scaled_offset_helper(kHRegSize, ld1h, ldff1h, SXTW, false);
10177
10178 Ld1Macro ld1w = &MacroAssembler::Ld1w;
10179 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
10180 ld1_32_scaled_offset_helper(kSRegSize, ld1w, ldff1w, UXTW, false);
10181 ld1_32_scaled_offset_helper(kSRegSize, ld1w, ldff1w, SXTW, false);
10182
10183 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
10184 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
10185 ld1_32_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, UXTW, true);
10186 ld1_32_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, SXTW, true);
TatWai Chong113d9192020-05-19 01:02:36 -070010187}
10188
Martyn Capewella5112342020-06-05 18:20:11 +010010189TEST_SVE(sve_ld1_scalar_plus_vector_32_unscaled_offset) {
10190 auto ld1_32_unscaled_offset_helper =
10191 std::bind(&GatherLoadScalarPlusVectorHelper<Extend>,
10192 config,
10193 std::placeholders::_1,
10194 kSRegSize,
10195 std::placeholders::_2,
10196 std::placeholders::_3,
10197 std::placeholders::_4,
10198 std::placeholders::_5,
10199 false);
TatWai Chong113d9192020-05-19 01:02:36 -070010200
Martyn Capewella5112342020-06-05 18:20:11 +010010201 Ld1Macro ld1b = &MacroAssembler::Ld1b;
10202 Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
10203 ld1_32_unscaled_offset_helper(kBRegSize, ld1b, ldff1b, UXTW, false);
10204 ld1_32_unscaled_offset_helper(kBRegSize, ld1b, ldff1b, SXTW, false);
10205
10206 Ld1Macro ld1h = &MacroAssembler::Ld1h;
10207 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
10208 ld1_32_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, UXTW, false);
10209 ld1_32_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, SXTW, false);
10210
10211 Ld1Macro ld1w = &MacroAssembler::Ld1w;
10212 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
10213 ld1_32_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, UXTW, false);
10214 ld1_32_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, SXTW, false);
10215
10216 Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
10217 Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
10218 ld1_32_unscaled_offset_helper(kBRegSize, ld1sb, ldff1sb, UXTW, true);
10219 ld1_32_unscaled_offset_helper(kBRegSize, ld1sb, ldff1sb, SXTW, true);
10220
10221 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
10222 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
10223 ld1_32_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, UXTW, true);
10224 ld1_32_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, SXTW, true);
TatWai Chong113d9192020-05-19 01:02:36 -070010225}
10226
Martyn Capewella5112342020-06-05 18:20:11 +010010227TEST_SVE(sve_ld1_scalar_plus_vector_32_unpacked_scaled_offset) {
10228 auto ld1_32_unpacked_scaled_offset_helper =
10229 std::bind(&GatherLoadScalarPlusVectorHelper<Extend>,
10230 config,
10231 std::placeholders::_1,
10232 kDRegSize,
10233 std::placeholders::_2,
10234 std::placeholders::_3,
10235 std::placeholders::_4,
10236 std::placeholders::_5,
10237 true);
TatWai Chong113d9192020-05-19 01:02:36 -070010238
Martyn Capewella5112342020-06-05 18:20:11 +010010239 Ld1Macro ld1h = &MacroAssembler::Ld1h;
10240 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
10241 ld1_32_unpacked_scaled_offset_helper(kHRegSize, ld1h, ldff1h, UXTW, false);
10242 ld1_32_unpacked_scaled_offset_helper(kHRegSize, ld1h, ldff1h, SXTW, false);
10243
10244 Ld1Macro ld1w = &MacroAssembler::Ld1w;
10245 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
10246 ld1_32_unpacked_scaled_offset_helper(kSRegSize, ld1w, ldff1w, UXTW, false);
10247 ld1_32_unpacked_scaled_offset_helper(kSRegSize, ld1w, ldff1w, SXTW, false);
10248
10249 Ld1Macro ld1d = &MacroAssembler::Ld1d;
10250 Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
10251 ld1_32_unpacked_scaled_offset_helper(kDRegSize, ld1d, ldff1d, UXTW, false);
10252 ld1_32_unpacked_scaled_offset_helper(kDRegSize, ld1d, ldff1d, SXTW, false);
10253
10254 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
10255 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
10256 ld1_32_unpacked_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, UXTW, true);
10257 ld1_32_unpacked_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, SXTW, true);
10258
10259 Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
10260 Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
10261 ld1_32_unpacked_scaled_offset_helper(kSRegSize, ld1sw, ldff1sw, UXTW, true);
10262 ld1_32_unpacked_scaled_offset_helper(kSRegSize, ld1sw, ldff1sw, SXTW, true);
TatWai Chong113d9192020-05-19 01:02:36 -070010263}
10264
Martyn Capewella5112342020-06-05 18:20:11 +010010265TEST_SVE(sve_ld1_scalar_plus_vector_32_unpacked_unscaled_offset) {
10266 auto ld1_32_unpacked_unscaled_offset_helper =
10267 std::bind(&GatherLoadScalarPlusVectorHelper<Extend>,
10268 config,
10269 std::placeholders::_1,
10270 kDRegSize,
10271 std::placeholders::_2,
10272 std::placeholders::_3,
10273 std::placeholders::_4,
10274 std::placeholders::_5,
10275 false);
10276
10277 Ld1Macro ld1h = &MacroAssembler::Ld1h;
10278 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
10279 ld1_32_unpacked_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, UXTW, false);
10280 ld1_32_unpacked_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, SXTW, false);
10281
10282 Ld1Macro ld1w = &MacroAssembler::Ld1w;
10283 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
10284 ld1_32_unpacked_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, UXTW, false);
10285 ld1_32_unpacked_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, SXTW, false);
10286
10287 Ld1Macro ld1d = &MacroAssembler::Ld1d;
10288 Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
10289 ld1_32_unpacked_unscaled_offset_helper(kDRegSize, ld1d, ldff1d, UXTW, false);
10290 ld1_32_unpacked_unscaled_offset_helper(kDRegSize, ld1d, ldff1d, SXTW, false);
10291
10292 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
10293 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
10294 ld1_32_unpacked_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, UXTW, true);
10295 ld1_32_unpacked_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, SXTW, true);
10296
10297 Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
10298 Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
10299 ld1_32_unpacked_unscaled_offset_helper(kSRegSize, ld1sw, ldff1sw, UXTW, true);
10300 ld1_32_unpacked_unscaled_offset_helper(kSRegSize, ld1sw, ldff1sw, SXTW, true);
TatWai Chong113d9192020-05-19 01:02:36 -070010301}
10302
Martyn Capewella5112342020-06-05 18:20:11 +010010303TEST_SVE(sve_ld1_scalar_plus_vector_64_scaled_offset) {
10304 auto ld1_64_scaled_offset_helper =
10305 std::bind(&GatherLoadScalarPlusVectorHelper<Shift>,
10306 config,
10307 std::placeholders::_1,
10308 kDRegSize,
10309 std::placeholders::_2,
10310 std::placeholders::_3,
10311 LSL,
10312 std::placeholders::_4,
10313 true);
TatWai Chong113d9192020-05-19 01:02:36 -070010314
Martyn Capewella5112342020-06-05 18:20:11 +010010315 Ld1Macro ld1h = &MacroAssembler::Ld1h;
10316 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
10317 ld1_64_scaled_offset_helper(kHRegSize, ld1h, ldff1h, false);
10318
10319 Ld1Macro ld1w = &MacroAssembler::Ld1w;
10320 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
10321 ld1_64_scaled_offset_helper(kSRegSize, ld1w, ldff1w, false);
10322
10323 Ld1Macro ld1d = &MacroAssembler::Ld1d;
10324 Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
10325 ld1_64_scaled_offset_helper(kDRegSize, ld1d, ldff1d, false);
10326
10327 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
10328 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
10329 ld1_64_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, true);
10330
10331 Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
10332 Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
10333 ld1_64_scaled_offset_helper(kSRegSize, ld1sw, ldff1sw, true);
10334}
10335
10336TEST_SVE(sve_ld1_scalar_plus_vector_64_unscaled_offset) {
10337 auto ld1_64_unscaled_offset_helper =
10338 std::bind(&GatherLoadScalarPlusVectorHelper<Shift>,
10339 config,
10340 std::placeholders::_1,
10341 kDRegSize,
10342 std::placeholders::_2,
10343 std::placeholders::_3,
10344 NO_SHIFT,
10345 std::placeholders::_4,
10346 false);
10347
10348 Ld1Macro ld1b = &MacroAssembler::Ld1b;
10349 Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
10350 ld1_64_unscaled_offset_helper(kBRegSize, ld1b, ldff1b, false);
10351
10352 Ld1Macro ld1h = &MacroAssembler::Ld1h;
10353 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
10354 ld1_64_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, false);
10355
10356 Ld1Macro ld1w = &MacroAssembler::Ld1w;
10357 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
10358 ld1_64_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, false);
10359
10360 Ld1Macro ld1d = &MacroAssembler::Ld1d;
10361 Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
10362 ld1_64_unscaled_offset_helper(kDRegSize, ld1d, ldff1d, false);
10363
10364 Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
10365 Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
10366 ld1_64_unscaled_offset_helper(kBRegSize, ld1sb, ldff1sb, true);
10367
10368 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
10369 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
10370 ld1_64_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, true);
10371
10372 Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
10373 Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
10374 ld1_64_unscaled_offset_helper(kSRegSize, ld1sw, ldff1sw, true);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010375}
10376
Martyn Capewell72765d12020-03-23 14:25:53 +000010377TEST_SVE(sve_ldnt1) {
10378 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
10379 START();
10380
10381 int data_size = kZRegMaxSizeInBytes * 16;
10382 uint8_t* data = new uint8_t[data_size];
10383 for (int i = 0; i < data_size; i++) {
10384 data[i] = i & 0xff;
10385 }
10386
10387 // Set the base half-way through the buffer so we can use negative indices.
10388 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
10389 __ Ptrue(p0.VnB());
10390 __ Punpklo(p1.VnH(), p0.VnB());
10391 __ Punpklo(p2.VnH(), p1.VnB());
10392 __ Punpklo(p3.VnH(), p2.VnB());
10393 __ Punpklo(p4.VnH(), p3.VnB());
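  // A note on the setup above and the checks below (an explanatory sketch):
  // each Punpklo halves the density of set predicate bits, so p1..p4 end up
  // with every other element active at the B, H, S and D access sizes they
  // govern below. Non-temporal loads differ from normal loads essentially
  // only in cache-hint behaviour, so each Ldnt1* result is expected to match
  // the corresponding Ld1* result exactly.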
10394
10395 __ Mov(x1, 42);
10396 __ Ld1b(z0.VnB(), p1.Zeroing(), SVEMemOperand(x0, x1));
10397 __ Ldnt1b(z1.VnB(), p1.Zeroing(), SVEMemOperand(x0, x1));
10398
10399 __ Mov(x1, -21);
10400 __ Ld1h(z2.VnH(), p2.Zeroing(), SVEMemOperand(x0, x1, LSL, 1));
10401 __ Ldnt1h(z3.VnH(), p2.Zeroing(), SVEMemOperand(x0, x1, LSL, 1));
10402
10403 __ Mov(x1, 10);
10404 __ Ld1w(z4.VnS(), p3.Zeroing(), SVEMemOperand(x0, x1, LSL, 2));
10405 __ Ldnt1w(z5.VnS(), p3.Zeroing(), SVEMemOperand(x0, x1, LSL, 2));
10406
10407 __ Mov(x1, -5);
10408 __ Ld1d(z6.VnD(), p4.Zeroing(), SVEMemOperand(x0, x1, LSL, 3));
10409 __ Ldnt1d(z7.VnD(), p4.Zeroing(), SVEMemOperand(x0, x1, LSL, 3));
10410
10411 __ Ld1b(z8.VnB(), p1.Zeroing(), SVEMemOperand(x0, 1, SVE_MUL_VL));
10412 __ Ldnt1b(z9.VnB(), p1.Zeroing(), SVEMemOperand(x0, 1, SVE_MUL_VL));
10413
10414 __ Ld1h(z10.VnH(), p2.Zeroing(), SVEMemOperand(x0, -1, SVE_MUL_VL));
10415 __ Ldnt1h(z11.VnH(), p2.Zeroing(), SVEMemOperand(x0, -1, SVE_MUL_VL));
10416
10417 __ Ld1w(z12.VnS(), p3.Zeroing(), SVEMemOperand(x0, 7, SVE_MUL_VL));
10418 __ Ldnt1w(z13.VnS(), p3.Zeroing(), SVEMemOperand(x0, 7, SVE_MUL_VL));
10419
10420 __ Ld1d(z14.VnD(), p4.Zeroing(), SVEMemOperand(x0, -8, SVE_MUL_VL));
10421 __ Ldnt1d(z15.VnD(), p4.Zeroing(), SVEMemOperand(x0, -8, SVE_MUL_VL));
10422 END();
10423
10424 if (CAN_RUN()) {
10425 RUN();
10426 ASSERT_EQUAL_SVE(z0, z1);
10427 ASSERT_EQUAL_SVE(z2, z3);
10428 ASSERT_EQUAL_SVE(z4, z5);
10429 ASSERT_EQUAL_SVE(z6, z7);
10430 ASSERT_EQUAL_SVE(z8, z9);
10431 ASSERT_EQUAL_SVE(z10, z11);
10432 ASSERT_EQUAL_SVE(z12, z13);
10433 ASSERT_EQUAL_SVE(z14, z15);
10434 }
  delete[] data;
10435}
10436
Martyn Capewell3e2fb502020-03-24 12:04:07 +000010437TEST_SVE(sve_stnt1) {
10438 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
10439 START();
10440
10441 int data_size = kZRegMaxSizeInBytes * 16;
10442 uint8_t* data = new uint8_t[data_size];
10443
10444 // Set the base half-way through the buffer so we can use negative indices.
10445 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
10446 __ Ptrue(p0.VnB());
10447 __ Punpklo(p1.VnH(), p0.VnB());
10448 __ Punpklo(p2.VnH(), p1.VnB());
10449 __ Punpklo(p3.VnH(), p2.VnB());
10450 __ Punpklo(p4.VnH(), p3.VnB());
10451 __ Dup(z0.VnB(), 0x55);
10452 __ Index(z1.VnB(), 0, 1);
10453
10454 // Store with all-true and patterned predication, load back, and create a
10455 // reference value for later comparison.
10456 __ Rdvl(x1, 1);
10457 __ Stnt1b(z0.VnB(), p0, SVEMemOperand(x0, x1));
10458 __ Stnt1b(z1.VnB(), p1, SVEMemOperand(x0, 1, SVE_MUL_VL));
10459 __ Ld1b(z2.VnB(), p0.Zeroing(), SVEMemOperand(x0, x1));
10460 __ Sel(z3.VnB(), p1, z1.VnB(), z0.VnB());
10461
10462 // Repeated, with wider elements and different offsets.
10463 __ Rdvl(x1, -1);
10464 __ Lsr(x1, x1, 1);
10465 __ Stnt1h(z0.VnH(), p0, SVEMemOperand(x0, x1, LSL, 1));
10466 __ Stnt1h(z1.VnH(), p2, SVEMemOperand(x0, -1, SVE_MUL_VL));
10467 __ Ld1b(z4.VnB(), p0.Zeroing(), SVEMemOperand(x0, x1, LSL, 1));
10468 __ Sel(z5.VnH(), p2, z1.VnH(), z0.VnH());
10469
10470 __ Rdvl(x1, 7);
10471 __ Lsr(x1, x1, 2);
10472 __ Stnt1w(z0.VnS(), p0, SVEMemOperand(x0, x1, LSL, 2));
10473 __ Stnt1w(z1.VnS(), p3, SVEMemOperand(x0, 7, SVE_MUL_VL));
10474 __ Ld1b(z6.VnB(), p0.Zeroing(), SVEMemOperand(x0, x1, LSL, 2));
10475 __ Sel(z7.VnS(), p3, z1.VnS(), z0.VnS());
10476
10477 __ Rdvl(x1, -8);
10478 __ Lsr(x1, x1, 3);
10479 __ Stnt1d(z0.VnD(), p0, SVEMemOperand(x0, x1, LSL, 3));
10480 __ Stnt1d(z1.VnD(), p4, SVEMemOperand(x0, -8, SVE_MUL_VL));
10481 __ Ld1b(z8.VnB(), p0.Zeroing(), SVEMemOperand(x0, x1, LSL, 3));
10482 __ Sel(z9.VnD(), p4, z1.VnD(), z0.VnD());
10483 END();
10484
10485 if (CAN_RUN()) {
10486 RUN();
10487 ASSERT_EQUAL_SVE(z2, z3);
10488 ASSERT_EQUAL_SVE(z4, z5);
10489 ASSERT_EQUAL_SVE(z6, z7);
10490 ASSERT_EQUAL_SVE(z8, z9);
10491 }
  delete[] data;
10492}
10493
Martyn Capewell452ad8b2020-03-19 15:49:57 +000010494TEST_SVE(sve_ld1rq) {
10495 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
10496 START();
10497
10498 int data_size = (kQRegSizeInBytes + 128) * 2;
10499 uint8_t* data = new uint8_t[data_size];
10500 for (int i = 0; i < data_size; i++) {
10501 data[i] = i & 0xff;
10502 }
10503
10504 // Set the base half-way through the buffer so we can use negative indices.
10505 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
10506
10507 __ Index(z0.VnB(), 0, 1);
10508 __ Ptrue(p0.VnB());
10509 __ Cmplo(p0.VnB(), p0.Zeroing(), z0.VnB(), 4);
10510 __ Pfalse(p1.VnB());
10511 __ Zip1(p1.VnB(), p0.VnB(), p1.VnB());
10512
10513 // Load and broadcast using scalar offsets.
10514 __ Mov(x1, -42);
10515 __ Ld1rqb(z0.VnB(), p1.Zeroing(), SVEMemOperand(x0, x1));
10516
10517 __ Add(x2, x0, 1);
10518 __ Mov(x1, -21);
10519 __ Punpklo(p2.VnH(), p1.VnB());
10520 __ Ld1rqh(z1.VnH(), p2.Zeroing(), SVEMemOperand(x2, x1, LSL, 1));
10521
10522 __ Add(x2, x2, 1);
10523 __ Mov(x1, -10);
10524 __ Punpklo(p3.VnH(), p2.VnB());
10525 __ Ld1rqw(z2.VnS(), p3.Zeroing(), SVEMemOperand(x2, x1, LSL, 2));
10526
10527 __ Add(x2, x2, 1);
10528 __ Mov(x1, 5);
10529 __ Punpklo(p4.VnH(), p3.VnB());
10530 __ Ld1rqd(z3.VnD(), p4.Zeroing(), SVEMemOperand(x2, x1, LSL, 3));
10531
10532 // Check that all segments match by rotating the vector by one segment,
10533 // eoring, and orring across the vector.
10534 __ Ext(z4.VnB(), z0.VnB(), z0.VnB(), 16);
10535 __ Eor(z4.VnB(), z4.VnB(), z0.VnB());
10536 __ Orv(b4, p0, z4.VnB());
10537 __ Ext(z5.VnB(), z1.VnB(), z1.VnB(), 16);
10538 __ Eor(z5.VnB(), z5.VnB(), z1.VnB());
10539 __ Orv(b5, p0, z5.VnB());
10540 __ Orr(z4, z4, z5);
10541 __ Ext(z5.VnB(), z2.VnB(), z2.VnB(), 16);
10542 __ Eor(z5.VnB(), z5.VnB(), z2.VnB());
10543 __ Orv(b5, p0, z5.VnB());
10544 __ Orr(z4, z4, z5);
10545 __ Ext(z5.VnB(), z3.VnB(), z3.VnB(), 16);
10546 __ Eor(z5.VnB(), z5.VnB(), z3.VnB());
10547 __ Orv(b5, p0, z5.VnB());
10548 __ Orr(z4, z4, z5);
10549
10550 // Load and broadcast the same values, using immediate offsets.
10551 __ Add(x1, x0, 6);
10552 __ Ld1rqb(z5.VnB(), p1.Zeroing(), SVEMemOperand(x1, -48));
10553 __ Add(x1, x0, -9);
10554 __ Ld1rqh(z6.VnH(), p2.Zeroing(), SVEMemOperand(x1, -32));
10555 __ Add(x1, x0, -70);
10556 __ Ld1rqw(z7.VnS(), p3.Zeroing(), SVEMemOperand(x1, 32));
10557 __ Add(x1, x0, 27);
10558 __ Ld1rqd(z8.VnD(), p4.Zeroing(), SVEMemOperand(x1, 16));
10559 END();
10560
10561 if (CAN_RUN()) {
10562 RUN();
10563 uint64_t expected_z0[] = {0x0000000000000000, 0x006c006a00680066};
10564 uint64_t expected_z1[] = {0x000074730000706f, 0x00006c6b00006867};
10565 uint64_t expected_z2[] = {0x0000000075747372, 0x000000006d6c6b6a};
10566 uint64_t expected_z3[] = {0x0000000000000000, 0xc2c1c0bfbebdbcbb};
10567 uint64_t expected_z4[] = {0, 0};
10568 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
10569 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
10570 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
10571 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
10572 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
10573 ASSERT_EQUAL_SVE(z0, z5);
10574 ASSERT_EQUAL_SVE(z1, z6);
10575 ASSERT_EQUAL_SVE(z2, z7);
10576 ASSERT_EQUAL_SVE(z3, z8);
10577 }
  delete[] data;
10578}
10579
Martyn Capewellb56cf222020-05-05 17:38:28 +010010580TEST_SVE(sve_st1_vec_imm) {
10581 SVE_SETUP_WITH_FEATURES(CPUFeatures::kNEON, CPUFeatures::kSVE);
10582 START();
10583
10584 // TODO: Use mmap() to request a buffer in the low 4GB, which allows testing
10585 // 32-bit address vectors.
10586 int data_size = kZRegMaxSizeInBytes * 16;
10587 uint8_t* data = new uint8_t[data_size];
10588
10589 // Set the base to 16 bytes from the end of the buffer so we can use negative
10590 // indices.
10591 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size - 16]));
10592 __ Ptrue(p0.VnB());
10593
10594 // Store a vector of index values in reverse order, using
10595 // vector-plus-immediate addressing to begin at byte 15, then storing to
10596 // bytes 14, 13, etc.
10597 __ Index(z1.VnD(), x0, -1);
10598 __ Index(z2.VnD(), 0, 1);
10599
10600 // Iterate in order to store at least 16 bytes. The number of iterations
10601 // depends on VL, e.g. VL128 iterates eight times, storing bytes 15 and 14
10602 // on the first iteration, 13 and 12 on the next, etc.
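  // Worked example of the loop bound (illustrative only): for VL128,
  // dlanes = 16 / 8 = 2, so `i` steps down by two bytes and the loop runs for
  // i = 15, 13, ..., 1, i.e. eight iterations of two byte stores each; for
  // VL2048, dlanes = 32 and a single iteration already covers all 16 bytes.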
10603 uint64_t dlanes = config->sve_vl_in_bytes() / kDRegSizeInBytes;
10604 for (int i = 15; i >= 0; i -= dlanes * kBRegSizeInBytes) {
10605 __ St1b(z2.VnD(), p0, SVEMemOperand(z1.VnD(), i));
10606 __ Incd(z2.VnD());
10607 }
10608
10609 // Reload the stored data, and build a reference for comparison. The reference
10610 // is truncated to a Q register, as only the least-significant 128 bits are
10611 // checked.
10612 __ Ldr(q4, MemOperand(x0));
10613 __ Index(z5.VnB(), 15, -1);
10614 __ Mov(q5, q5);
10615
10616 // Repeat for wider elements.
10617 __ Index(z1.VnD(), x0, -2); // Stepping by -2 for H-sized elements.
10618 __ Index(z2.VnD(), 0, 1);
10619 for (int i = 14; i >= 0; i -= dlanes * kHRegSizeInBytes) {
10620 __ St1h(z2.VnD(), p0, SVEMemOperand(z1.VnD(), i));
10621 __ Incd(z2.VnD());
10622 }
10623 __ Ldr(q6, MemOperand(x0));
10624 __ Index(z7.VnH(), 7, -1);
10625 __ Mov(q7, q7);
10626
10627 __ Index(z1.VnD(), x0, -4); // Stepping by -4 for S-sized elements.
10628 __ Index(z2.VnD(), 0, 1);
10629 for (int i = 12; i >= 0; i -= dlanes * kSRegSizeInBytes) {
10630 __ St1w(z2.VnD(), p0, SVEMemOperand(z1.VnD(), i));
10631 __ Incd(z2.VnD());
10632 }
10633 __ Ldr(q8, MemOperand(x0));
10634 __ Index(z9.VnS(), 3, -1);
10635 __ Mov(q9, q9);
10636
10637 __ Index(z1.VnD(), x0, -8); // Stepping by -8 for D-sized elements.
10638 __ Index(z2.VnD(), 0, 1);
10639 for (int i = 8; i >= 0; i -= dlanes * kDRegSizeInBytes) {
10640 __ St1d(z2.VnD(), p0, SVEMemOperand(z1.VnD(), i));
10641 __ Incd(z2.VnD());
10642 }
10643 __ Ldr(q10, MemOperand(x0));
10644 __ Index(z11.VnD(), 1, -1);
10645 __ Mov(q11, q11);
10646
10647 // Test predication by using a governing predicate to store only the even
10648 // halfwords, at byte-separated addresses. The result should be the same as
10649 // storing those halfwords contiguously to memory.
10650 __ Pfalse(p1.VnB());
10651 __ Zip1(p1.VnD(), p0.VnD(), p1.VnD());
10652 __ Mov(x0, reinterpret_cast<uintptr_t>(data));
10653 __ Index(z1.VnD(), x0, 1);
10654 __ Index(z2.VnD(), 0x1000, 1);
10655 for (int i = 0; i < 16; i += dlanes) {
10656 __ St1h(z2.VnD(), p1, SVEMemOperand(z1.VnD(), i));
10657 __ Incd(z2.VnD());
10658 }
10659 __ Ldr(q2, MemOperand(x0));
10660 __ Index(z3.VnH(), 0x1000, 2);
10661 __ Mov(q3, q3);
10662
10663 END();
10664
10665 if (CAN_RUN()) {
10666 RUN();
10667
10668 ASSERT_EQUAL_SVE(z3, z2);
10669 ASSERT_EQUAL_SVE(z5, z4);
10670 ASSERT_EQUAL_SVE(z7, z6);
10671 ASSERT_EQUAL_SVE(z9, z8);
10672 ASSERT_EQUAL_SVE(z11, z10);
10673 }
  delete[] data;
10674}
10675
TatWai Chong5f3928c2020-06-11 00:09:20 -070010676template <typename T>
10677static void sve_st1_scalar_plus_vector_helper(Test* config,
10678 int esize_in_bits,
10679 T mod,
10680 bool is_scaled) {
Martyn Capewellfa098bc2020-05-12 10:21:56 +010010681 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
10682 START();
10683
10684 int vl = config->sve_vl_in_bytes();
10685 int data_size = vl * 160;
10686 uint8_t* data = new uint8_t[data_size];
10687 memset(data, 0, data_size);
TatWai Chong5f3928c2020-06-11 00:09:20 -070010688 int vl_per_esize = vl / (esize_in_bits / kBitsPerByte);
10689
10690 ZRegister zn_b = z0.WithLaneSize(esize_in_bits);
10691 ZRegister zn_h = z1.WithLaneSize(esize_in_bits);
10692 ZRegister zn_s = z2.WithLaneSize(esize_in_bits);
10693 ZRegister zn_d = z3.WithLaneSize(esize_in_bits);
10694
10695 ZRegister zn_ld_b = z10.WithLaneSize(esize_in_bits);
10696 ZRegister zn_ld_h = z11.WithLaneSize(esize_in_bits);
10697 ZRegister zn_ld_s = z12.WithLaneSize(esize_in_bits);
10698 ZRegister zn_ld_d = z13.WithLaneSize(esize_in_bits);
10699 ZRegister offsets = z31.WithLaneSize(esize_in_bits);
Martyn Capewellfa098bc2020-05-12 10:21:56 +010010700
10701 // Set the base half-way through the buffer so we can use negative indices.
10702 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
TatWai Chong5f3928c2020-06-11 00:09:20 -070010703 __ Ptrue(p6.WithLaneSize(esize_in_bits));
10704 __ Pfalse(p7.WithLaneSize(esize_in_bits));
10705 __ Zip1(p0.WithLaneSize(esize_in_bits),
10706 p6.WithLaneSize(esize_in_bits),
10707 p7.WithLaneSize(esize_in_bits));
10708 __ Zip1(p1.WithLaneSize(esize_in_bits),
10709 p7.WithLaneSize(esize_in_bits),
10710 p6.WithLaneSize(esize_in_bits));
Martyn Capewellfa098bc2020-05-12 10:21:56 +010010711
TatWai Chong5f3928c2020-06-11 00:09:20 -070010712 // `st1b` doesn't have the scaled-offset forms.
10713 if (is_scaled == false) {
10714 // Step the index by 2 to simulate a scatter memory access.
10715 __ Index(offsets, 1, 2);
10716 __ St1b(offsets, p0, SVEMemOperand(x0, offsets, mod));
10717 __ Ld1b(zn_ld_b, p0.Zeroing(), SVEMemOperand(x0, offsets, mod));
10718 __ Dup(zn_b, 0);
10719 __ Mov(zn_b, p0.Merging(), offsets);
10720 }
Martyn Capewellfa098bc2020-05-12 10:21:56 +010010721
Martyn Capewellfa098bc2020-05-12 10:21:56 +010010722 // Store the values to an isolated range that doesn't overlap the other stores.
TatWai Chong5f3928c2020-06-11 00:09:20 -070010723 int scale = is_scaled ? 1 : 0;
10724 __ Add(x1, x0, vl_per_esize * 4);
10725 __ Index(offsets, 6, 4);
10726 __ St1h(offsets, p0, SVEMemOperand(x1, offsets, mod, scale));
10727 __ Ld1h(zn_ld_h, p0.Zeroing(), SVEMemOperand(x1, offsets, mod, scale));
10728 __ Dup(zn_h, 0);
10729 __ Mov(zn_h, p0.Merging(), offsets);
Martyn Capewellfa098bc2020-05-12 10:21:56 +010010730
TatWai Chong5f3928c2020-06-11 00:09:20 -070010731 scale = is_scaled ? 2 : 0;
10732 __ Add(x2, x0, UINT64_MAX + (vl_per_esize * -8) + 1);
10733 __ Index(offsets, 64, 8);
10734 if ((std::is_same<T, vixl::aarch64::Extend>::value) &&
10735 (static_cast<int>(mod) == SXTW)) {
10736 // Testing negative offsets.
10737 __ Neg(offsets, p6.Merging(), offsets);
10738 }
10739 __ St1w(offsets, p1, SVEMemOperand(x2, offsets, mod, scale));
10740 __ Ld1w(zn_ld_s, p1.Zeroing(), SVEMemOperand(x2, offsets, mod, scale));
10741 __ Dup(zn_s, 0);
10742 __ Mov(zn_s, p1.Merging(), offsets);
Martyn Capewellfa098bc2020-05-12 10:21:56 +010010743
TatWai Chong5f3928c2020-06-11 00:09:20 -070010744 if (esize_in_bits == kDRegSize) {
10745 // Test st1w by comparing each 32-bit value loaded back with the
10746 // 32-bit value that was stored.
10747 __ Lsl(zn_s, zn_s, kSRegSize);
10748 __ Lsr(zn_s, zn_s, kSRegSize);
10749 }
Martyn Capewellfa098bc2020-05-12 10:21:56 +010010750
TatWai Chong5f3928c2020-06-11 00:09:20 -070010751 // `st1d` doesn't have the S-sized lane forms.
10752 if (esize_in_bits == kDRegSize) {
10753 scale = is_scaled ? 3 : 0;
10754 __ Add(x3, x0, UINT64_MAX + (vl_per_esize * -16) + 1);
10755 __ Index(offsets, 128, 16);
10756 if ((std::is_same<T, vixl::aarch64::Extend>::value) &&
10757 (static_cast<int>(mod) == SXTW)) {
10758 __ Neg(offsets, p6.Merging(), offsets);
10759 }
10760 __ St1d(offsets, p1, SVEMemOperand(x3, offsets, mod, scale));
10761 __ Ld1d(zn_ld_d, p1.Zeroing(), SVEMemOperand(x3, offsets, mod, scale));
10762 __ Dup(zn_d, 0);
10763 __ Mov(zn_d, p1.Merging(), offsets);
10764 }
Martyn Capewellfa098bc2020-05-12 10:21:56 +010010765
10766 END();
10767
10768 if (CAN_RUN()) {
10769 RUN();
10770
TatWai Chong5f3928c2020-06-11 00:09:20 -070010771 if (is_scaled == false) {
10772 ASSERT_EQUAL_SVE(zn_ld_b, zn_b);
10773 }
Martyn Capewellfa098bc2020-05-12 10:21:56 +010010774
TatWai Chong5f3928c2020-06-11 00:09:20 -070010775 ASSERT_EQUAL_SVE(zn_ld_h, zn_h);
10776 ASSERT_EQUAL_SVE(zn_ld_s, zn_s);
10777
10778 if (esize_in_bits == kDRegSize) {
10779 ASSERT_EQUAL_SVE(zn_ld_d, zn_d);
10780 }
Martyn Capewellfa098bc2020-05-12 10:21:56 +010010781 }
10782
10783 delete[] data;
10784}
10785
TatWai Chong5f3928c2020-06-11 00:09:20 -070010786TEST_SVE(sve_st1_sca_vec_32_unpacked_unscaled) {
10787 sve_st1_scalar_plus_vector_helper(config, kDRegSize, UXTW, false);
10788 sve_st1_scalar_plus_vector_helper(config, kDRegSize, SXTW, false);
10789}
10790
10791TEST_SVE(sve_st1_sca_vec_32_unpacked_scaled) {
10792 sve_st1_scalar_plus_vector_helper(config, kDRegSize, UXTW, true);
10793 sve_st1_scalar_plus_vector_helper(config, kDRegSize, SXTW, true);
10794}
10795
10796TEST_SVE(sve_st1_sca_vec_32_unscaled) {
10797 sve_st1_scalar_plus_vector_helper(config, kSRegSize, UXTW, false);
10798 sve_st1_scalar_plus_vector_helper(config, kSRegSize, SXTW, false);
10799}
10800
10801TEST_SVE(sve_st1_sca_vec_32_scaled) {
10802 sve_st1_scalar_plus_vector_helper(config, kSRegSize, UXTW, true);
10803 sve_st1_scalar_plus_vector_helper(config, kSRegSize, SXTW, true);
10804}
10805
10806TEST_SVE(sve_st1_sca_vec_64_scaled) {
10807 sve_st1_scalar_plus_vector_helper(config, kDRegSize, LSL, true);
10808}
10809
10810TEST_SVE(sve_st1_sca_vec_64_unscaled) {
10811 sve_st1_scalar_plus_vector_helper(config, kDRegSize, NO_SHIFT, false);
10812}
10813
TatWai Chong6995bfd2019-09-26 10:48:05 +010010814typedef void (MacroAssembler::*IntWideImmFn)(const ZRegister& zd,
10815 const ZRegister& zn,
10816 const IntegerOperand imm);
10817
10818template <typename F, typename Td, typename Tn>
10819static void IntWideImmHelper(Test* config,
10820 F macro,
10821 unsigned lane_size_in_bits,
10822 const Tn& zn_inputs,
10823 IntegerOperand imm,
10824 const Td& zd_expected) {
10825 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
10826 START();
10827
10828 ZRegister zd1 = z0.WithLaneSize(lane_size_in_bits);
10829 InsrHelper(&masm, zd1, zn_inputs);
10830
10831 // Also test with a different zn, to test the movprfx case.
10832 ZRegister zn = z1.WithLaneSize(lane_size_in_bits);
10833 InsrHelper(&masm, zn, zn_inputs);
10834 ZRegister zd2 = z2.WithLaneSize(lane_size_in_bits);
10835 ZRegister zn_copy = z3.WithSameLaneSizeAs(zn);
10836
10837 // Make a copy so we can check that constructive operations preserve zn.
10838 __ Mov(zn_copy, zn);
10839
10840 {
10841 UseScratchRegisterScope temps(&masm);
10842 // The MacroAssembler needs a P scratch register for some of these macros,
10843 // and it doesn't have one by default.
10844 temps.Include(p3);
10845
10846 (masm.*macro)(zd1, zd1, imm);
10847 (masm.*macro)(zd2, zn, imm);
10848 }
10849
10850 END();
10851
10852 if (CAN_RUN()) {
10853 RUN();
10854
10855 ASSERT_EQUAL_SVE(zd_expected, zd1);
10856
10857 // Check that the constructive form (which requires movprfx) gives the
10858 // same result as the destructive form.
10859 ASSERT_EQUAL_SVE(zd_expected, zd2);
10860
10861 ASSERT_EQUAL_SVE(zn_copy, zn);
10862 }
10863}
10864
10865TEST_SVE(sve_int_wide_imm_unpredicated_smax) {
10866 int in_b[] = {0, -128, 127, -127, 126, 1, -1, 55};
10867 int in_h[] = {0, -128, 127, INT16_MIN, INT16_MAX, 1, -1, 5555};
10868 int in_s[] = {0, -128, 127, INT32_MIN, INT32_MAX, 1, -1, 555555};
10869 int64_t in_d[] = {1, 10, 10000, 1000000};
10870
10871 IntWideImmFn fn = &MacroAssembler::Smax;
10872
10873 int exp_b_1[] = {0, -1, 127, -1, 126, 1, -1, 55};
10874 int exp_h_1[] = {127, 127, 127, 127, INT16_MAX, 127, 127, 5555};
10875 int exp_s_1[] = {0, -128, 127, -128, INT32_MAX, 1, -1, 555555};
10876 int64_t exp_d_1[] = {99, 99, 10000, 1000000};
10877
10878 IntWideImmHelper(config, fn, kBRegSize, in_b, -1, exp_b_1);
10879 IntWideImmHelper(config, fn, kHRegSize, in_h, 127, exp_h_1);
10880 IntWideImmHelper(config, fn, kSRegSize, in_s, -128, exp_s_1);
10881 IntWideImmHelper(config, fn, kDRegSize, in_d, 99, exp_d_1);
10882
10883 int exp_h_2[] = {0, -128, 127, -255, INT16_MAX, 1, -1, 5555};
10884 int exp_s_2[] = {2048, 2048, 2048, 2048, INT32_MAX, 2048, 2048, 555555};
10885 int64_t exp_d_2[] = {INT16_MAX, INT16_MAX, INT16_MAX, 1000000};
10886
10887 // The immediate is in the range [-128, 127], but the macro is able to
10888 // synthesise unencodable immediates.
10889 // B-sized lanes cannot take an immediate out of the range [-128, 127].
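  // How the synthesis happens is not spelled out here; presumably the
  // MacroAssembler materialises the out-of-range immediate in a scratch Z
  // register and falls back to a vector (predicated) form, which would also
  // explain why IntWideImmHelper has to provide a P scratch register.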
10890 IntWideImmHelper(config, fn, kHRegSize, in_h, -255, exp_h_2);
10891 IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2);
10892 IntWideImmHelper(config, fn, kDRegSize, in_d, INT16_MAX, exp_d_2);
10893}
10894
10895TEST_SVE(sve_int_wide_imm_unpredicated_smin) {
10896 int in_b[] = {0, -128, 127, -127, 126, 1, -1, 55};
10897 int in_h[] = {0, -128, 127, INT16_MIN, INT16_MAX, 1, -1, 5555};
10898 int in_s[] = {0, -128, 127, INT32_MIN, INT32_MAX, 1, -1, 555555};
10899 int64_t in_d[] = {1, 10, 10000, 1000000};
10900
10901 IntWideImmFn fn = &MacroAssembler::Smin;
10902
10903 int exp_b_1[] = {-1, -128, -1, -127, -1, -1, -1, -1};
10904 int exp_h_1[] = {0, -128, 127, INT16_MIN, 127, 1, -1, 127};
10905 int exp_s_1[] = {-128, -128, -128, INT32_MIN, -128, -128, -128, -128};
10906 int64_t exp_d_1[] = {1, 10, 99, 99};
10907
10908 IntWideImmHelper(config, fn, kBRegSize, in_b, -1, exp_b_1);
10909 IntWideImmHelper(config, fn, kHRegSize, in_h, 127, exp_h_1);
10910 IntWideImmHelper(config, fn, kSRegSize, in_s, -128, exp_s_1);
10911 IntWideImmHelper(config, fn, kDRegSize, in_d, 99, exp_d_1);
10912
10913 int exp_h_2[] = {-255, -255, -255, INT16_MIN, -255, -255, -255, -255};
10914 int exp_s_2[] = {0, -128, 127, INT32_MIN, 2048, 1, -1, 2048};
10915 int64_t exp_d_2[] = {1, 10, 10000, INT16_MAX};
10916
10917 // The immediate is in the range [-128, 127], but the macro is able to
10918 // synthesise unencodable immediates.
10919 // B-sized lanes cannot take an immediate out of the range [-128, 127].
10920 IntWideImmHelper(config, fn, kHRegSize, in_h, -255, exp_h_2);
10921 IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2);
10922 IntWideImmHelper(config, fn, kDRegSize, in_d, INT16_MAX, exp_d_2);
10923}
10924
10925TEST_SVE(sve_int_wide_imm_unpredicated_umax) {
10926 int in_b[] = {0, 255, 127, 0x80, 1, 55};
10927 int in_h[] = {0, 255, 127, INT16_MAX, 1, 5555};
10928 int in_s[] = {0, 0xff, 0x7f, INT32_MAX, 1, 555555};
10929 int64_t in_d[] = {1, 10, 10000, 1000000};
10930
10931 IntWideImmFn fn = &MacroAssembler::Umax;
10932
10933 int exp_b_1[] = {17, 255, 127, 0x80, 17, 55};
10934 int exp_h_1[] = {127, 255, 127, INT16_MAX, 127, 5555};
10935 int exp_s_1[] = {255, 255, 255, INT32_MAX, 255, 555555};
10936 int64_t exp_d_1[] = {99, 99, 10000, 1000000};
10937
10938 IntWideImmHelper(config, fn, kBRegSize, in_b, 17, exp_b_1);
10939 IntWideImmHelper(config, fn, kHRegSize, in_h, 0x7f, exp_h_1);
10940 IntWideImmHelper(config, fn, kSRegSize, in_s, 0xff, exp_s_1);
10941 IntWideImmHelper(config, fn, kDRegSize, in_d, 99, exp_d_1);
10942
10943 int exp_h_2[] = {511, 511, 511, INT16_MAX, 511, 5555};
10944 int exp_s_2[] = {2048, 2048, 2048, INT32_MAX, 2048, 555555};
10945 int64_t exp_d_2[] = {INT16_MAX, INT16_MAX, INT16_MAX, 1000000};
10946
10947 // The immediate is in the range [0, 255], but the macro is able to
10948 // synthesise unencodable immediates.
10949 // B-sized lanes cannot take an immediate out of the range [0, 255].
10950 IntWideImmHelper(config, fn, kHRegSize, in_h, 511, exp_h_2);
10951 IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2);
10952 IntWideImmHelper(config, fn, kDRegSize, in_d, INT16_MAX, exp_d_2);
10953}
10954
10955TEST_SVE(sve_int_wide_imm_unpredicated_umin) {
10956 int in_b[] = {0, 255, 127, 0x80, 1, 55};
10957 int in_h[] = {0, 255, 127, INT16_MAX, 1, 5555};
10958 int in_s[] = {0, 0xff, 0x7f, INT32_MAX, 1, 555555};
10959 int64_t in_d[] = {1, 10, 10000, 1000000};
10960
10961 IntWideImmFn fn = &MacroAssembler::Umin;
10962
10963 int exp_b_1[] = {0, 17, 17, 17, 1, 17};
10964 int exp_h_1[] = {0, 127, 127, 127, 1, 127};
10965 int exp_s_1[] = {0, 255, 127, 255, 1, 255};
10966 int64_t exp_d_1[] = {1, 10, 99, 99};
10967
10968 IntWideImmHelper(config, fn, kBRegSize, in_b, 17, exp_b_1);
10969 IntWideImmHelper(config, fn, kHRegSize, in_h, 0x7f, exp_h_1);
10970 IntWideImmHelper(config, fn, kSRegSize, in_s, 255, exp_s_1);
10971 IntWideImmHelper(config, fn, kDRegSize, in_d, 99, exp_d_1);
10972
10973 int exp_h_2[] = {0, 255, 127, 511, 1, 511};
10974 int exp_s_2[] = {0, 255, 127, 2048, 1, 2048};
10975 int64_t exp_d_2[] = {1, 10, 10000, INT16_MAX};
10976
10977 // The immediate is in the range [0, 255], but the macro is able to
10978 // synthesise unencodable immediates.
10979 // B-sized lanes cannot take an immediate out of the range [0, 255].
10980 IntWideImmHelper(config, fn, kHRegSize, in_h, 511, exp_h_2);
10981 IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2);
10982 IntWideImmHelper(config, fn, kDRegSize, in_d, INT16_MAX, exp_d_2);
10983}
10984
10985TEST_SVE(sve_int_wide_imm_unpredicated_mul) {
10986 int in_b[] = {11, -1, 7, -3};
10987 int in_h[] = {111, -1, 17, -123};
10988 int in_s[] = {11111, -1, 117, -12345};
10989 int64_t in_d[] = {0x7fffffff, 0x80000000};
10990
10991 IntWideImmFn fn = &MacroAssembler::Mul;
10992
10993 int exp_b_1[] = {66, -6, 42, -18};
10994 int exp_h_1[] = {-14208, 128, -2176, 15744};
10995 int exp_s_1[] = {11111 * 127, -127, 117 * 127, -12345 * 127};
10996 int64_t exp_d_1[] = {0xfffffffe, 0x100000000};
10997
10998 IntWideImmHelper(config, fn, kBRegSize, in_b, 6, exp_b_1);
10999 IntWideImmHelper(config, fn, kHRegSize, in_h, -128, exp_h_1);
11000 IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
11001 IntWideImmHelper(config, fn, kDRegSize, in_d, 2, exp_d_1);
11002
11003 int exp_h_2[] = {-28305, 255, -4335, 31365};
11004 int exp_s_2[] = {22755328, -2048, 239616, -25282560};
11005 int64_t exp_d_2[] = {0x00000063ffffff38, 0x0000006400000000};
11006
11007 // The immediate is in the range [-128, 127], but the macro is able to
11008 // synthesise unencodable immediates.
11009 // B-sized lanes cannot take an immediate out of the range [-128, 127].
11010 IntWideImmHelper(config, fn, kHRegSize, in_h, -255, exp_h_2);
11011 IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2);
11012 IntWideImmHelper(config, fn, kDRegSize, in_d, 200, exp_d_2);
11013
11014 // Integer overflow on multiplication.
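  // For example (working the first and last inputs through by hand):
  // 11 * 127 = 1397 = 0x575, which truncates to 0x75 in a B-sized lane, and
  // -3 * 127 = -381, which wraps to 0x83.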
11015 unsigned exp_b_3[] = {0x75, 0x81, 0x79, 0x83};
11016
11017 IntWideImmHelper(config, fn, kBRegSize, in_b, 0x7f, exp_b_3);
11018}
11019
11020TEST_SVE(sve_int_wide_imm_unpredicated_add) {
11021 unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
11022 unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
11023 unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
11024 uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};
11025
11026 IntWideImmFn fn = &MacroAssembler::Add;
11027
11028 unsigned exp_b_1[] = {0x02, 0x00, 0x91, 0x80};
11029 unsigned exp_h_1[] = {0x8191, 0x7f8f, 0x1020, 0xaaba};
11030 unsigned exp_s_1[] = {0x80018200, 0x7fff7ffe, 0xaaaaab29, 0xf000f16f};
11031 uint64_t exp_d_1[] = {0x8000000180018280, 0x7fffffff7fff807e};
11032
11033 // Encodable with `add` (shift 0).
11034 IntWideImmHelper(config, fn, kBRegSize, in_b, 0x81, exp_b_1);
11035 IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
11036 IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
11037 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff, exp_d_1);
11038
11039 unsigned exp_h_2[] = {0x9181, 0x8f7f, 0x2010, 0xbaaa};
11040 unsigned exp_s_2[] = {0x80020081, 0x7ffffe7f, 0xaaab29aa, 0xf0016ff0};
11041 uint64_t exp_d_2[] = {0x8000000180028081, 0x7fffffff80007e7f};
11042
11043 // Encodable with `add` (shift 8).
11044 // B-sized lanes cannot take a shift of 8.
11045 IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
11046 IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
11047 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
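  // The grouping above reflects the underlying encoding: the SVE ADD/SUB
  // (immediate) forms take an unsigned 8-bit immediate, optionally shifted
  // left by 8, so values such as 16 << 8 and 0xff << 8 encode directly while
  // 127 << 16 (below) has to be synthesised by the MacroAssembler.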
11048
11049 unsigned exp_s_3[] = {0x80808181, 0x807e7f7f, 0xab29aaaa, 0xf07ff0f0};
11050
11051 // The macro is able to synthesise unencodable immediates.
11052 IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 16, exp_s_3);
Jacob Bramleyd9f929c2019-10-02 11:42:56 +010011053
11054 unsigned exp_b_4[] = {0x61, 0x5f, 0xf0, 0xdf};
11055 unsigned exp_h_4[] = {0x6181, 0x5f7f, 0xf010, 0x8aaa};
11056 unsigned exp_s_4[] = {0x00018181, 0xffff7f7f, 0x2aaaaaaa, 0x7000f0f0};
11057 uint64_t exp_d_4[] = {0x8000000180018180, 0x7fffffff7fff7f7e};
11058
11059 // Negative immediates use `sub`.
11060 IntWideImmHelper(config, fn, kBRegSize, in_b, -0x20, exp_b_4);
11061 IntWideImmHelper(config, fn, kHRegSize, in_h, -0x2000, exp_h_4);
11062 IntWideImmHelper(config, fn, kSRegSize, in_s, INT32_MIN, exp_s_4);
11063 IntWideImmHelper(config, fn, kDRegSize, in_d, -1, exp_d_4);
TatWai Chong6995bfd2019-09-26 10:48:05 +010011064}
11065
11066TEST_SVE(sve_int_wide_imm_unpredicated_sqadd) {
11067 unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
11068 unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
11069 unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
11070 uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};
11071
11072 IntWideImmFn fn = &MacroAssembler::Sqadd;
11073
Jacob Bramleyb28f6172019-10-02 12:12:35 +010011074 unsigned exp_b_1[] = {0x02, 0x7f, 0x7f, 0x7f};
TatWai Chong6995bfd2019-09-26 10:48:05 +010011075 unsigned exp_h_1[] = {0x8191, 0x7f8f, 0x1020, 0xaaba};
11076 unsigned exp_s_1[] = {0x80018200, 0x7fff7ffe, 0xaaaaab29, 0xf000f16f};
11077 uint64_t exp_d_1[] = {0x8000000180018280, 0x7fffffff7fff807e};
11078
11079 // Encodable with `sqadd` (shift 0).
Jacob Bramleyb28f6172019-10-02 12:12:35 +010011080 // Note that encodable immediates are unsigned, even for signed saturation.
11081 IntWideImmHelper(config, fn, kBRegSize, in_b, 129, exp_b_1);
TatWai Chong6995bfd2019-09-26 10:48:05 +010011082 IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
11083 IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
Jacob Bramleyb28f6172019-10-02 12:12:35 +010011084 IntWideImmHelper(config, fn, kDRegSize, in_d, 255, exp_d_1);
TatWai Chong6995bfd2019-09-26 10:48:05 +010011085
11086 unsigned exp_h_2[] = {0x9181, 0x7fff, 0x2010, 0xbaaa};
11087 unsigned exp_s_2[] = {0x80020081, 0x7ffffe7f, 0xaaab29aa, 0xf0016ff0};
11088 uint64_t exp_d_2[] = {0x8000000180028081, 0x7fffffff80007e7f};
11089
11090 // Encodable with `sqadd` (shift 8).
11091 // B-sized lanes cannot take a shift of 8.
11092 IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
11093 IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
11094 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
TatWai Chong6995bfd2019-09-26 10:48:05 +010011095}
11096
11097TEST_SVE(sve_int_wide_imm_unpredicated_uqadd) {
11098 unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
11099 unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
11100 unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
11101 uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};
11102
11103 IntWideImmFn fn = &MacroAssembler::Uqadd;
11104
11105 unsigned exp_b_1[] = {0xff, 0xff, 0x91, 0xff};
11106 unsigned exp_h_1[] = {0x8191, 0x7f8f, 0x1020, 0xaaba};
11107 unsigned exp_s_1[] = {0x80018200, 0x7fff7ffe, 0xaaaaab29, 0xf000f16f};
11108 uint64_t exp_d_1[] = {0x8000000180018280, 0x7fffffff7fff807e};
11109
11110 // Encodable with `uqadd` (shift 0).
11111 IntWideImmHelper(config, fn, kBRegSize, in_b, 0x81, exp_b_1);
11112 IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
11113 IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
11114 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff, exp_d_1);
11115
11116 unsigned exp_h_2[] = {0x9181, 0x8f7f, 0x2010, 0xbaaa};
11117 unsigned exp_s_2[] = {0x80020081, 0x7ffffe7f, 0xaaab29aa, 0xf0016ff0};
11118 uint64_t exp_d_2[] = {0x8000000180028081, 0x7fffffff80007e7f};
11119
11120 // Encodable with `uqadd` (shift 8).
11121 // B-sized lanes cannot take a shift of 8.
11122 IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
11123 IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
11124 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
TatWai Chong6995bfd2019-09-26 10:48:05 +010011125}
11126
11127TEST_SVE(sve_int_wide_imm_unpredicated_sub) {
11128 unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
11129 unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
11130 unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
11131 uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};
11132
11133 IntWideImmFn fn = &MacroAssembler::Sub;
11134
11135 unsigned exp_b_1[] = {0x00, 0xfe, 0x8f, 0x7e};
11136 unsigned exp_h_1[] = {0x8171, 0x7f6f, 0x1000, 0xaa9a};
11137 unsigned exp_s_1[] = {0x80018102, 0x7fff7f00, 0xaaaaaa2b, 0xf000f071};
11138 uint64_t exp_d_1[] = {0x8000000180018082, 0x7fffffff7fff7e80};
11139
11140 // Encodable with `sub` (shift 0).
11141 IntWideImmHelper(config, fn, kBRegSize, in_b, 0x81, exp_b_1);
11142 IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
11143 IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
11144 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff, exp_d_1);
11145
11146 unsigned exp_h_2[] = {0x7181, 0x6f7f, 0x0010, 0x9aaa};
11147 unsigned exp_s_2[] = {0x80010281, 0x7fff007f, 0xaaaa2baa, 0xf00071f0};
11148 uint64_t exp_d_2[] = {0x8000000180008281, 0x7fffffff7ffe807f};
11149
11150 // Encodable with `sub` (shift 8).
11151 // B-sized lanes cannot take a shift of 8.
11152 IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
11153 IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
11154 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
11155
11156 unsigned exp_s_3[] = {0x7f828181, 0x7f807f7f, 0xaa2baaaa, 0xef81f0f0};
11157
11158 // The macro is able to synthesise unencodable immediates.
11159 IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 16, exp_s_3);
Jacob Bramleyd9f929c2019-10-02 11:42:56 +010011160
11161 unsigned exp_b_4[] = {0xa1, 0x9f, 0x30, 0x1f};
11162 unsigned exp_h_4[] = {0xa181, 0x9f7f, 0x3010, 0xcaaa};
11163 unsigned exp_s_4[] = {0x00018181, 0xffff7f7f, 0x2aaaaaaa, 0x7000f0f0};
11164 uint64_t exp_d_4[] = {0x8000000180018182, 0x7fffffff7fff7f80};
11165
11166 // Negative immediates use `add`.
11167 IntWideImmHelper(config, fn, kBRegSize, in_b, -0x20, exp_b_4);
11168 IntWideImmHelper(config, fn, kHRegSize, in_h, -0x2000, exp_h_4);
11169 IntWideImmHelper(config, fn, kSRegSize, in_s, INT32_MIN, exp_s_4);
11170 IntWideImmHelper(config, fn, kDRegSize, in_d, -1, exp_d_4);
TatWai Chong6995bfd2019-09-26 10:48:05 +010011171}
11172
11173TEST_SVE(sve_int_wide_imm_unpredicated_sqsub) {
11174 unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
11175 unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
11176 unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
11177 uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};
11178
11179 IntWideImmFn fn = &MacroAssembler::Sqsub;
11180
Jacob Bramleyb28f6172019-10-02 12:12:35 +010011181 unsigned exp_b_1[] = {0x80, 0xfe, 0x8f, 0x80};
TatWai Chong6995bfd2019-09-26 10:48:05 +010011182 unsigned exp_h_1[] = {0x8171, 0x7f6f, 0x1000, 0xaa9a};
11183 unsigned exp_s_1[] = {0x80018102, 0x7fff7f00, 0xaaaaaa2b, 0xf000f071};
11184 uint64_t exp_d_1[] = {0x8000000180018082, 0x7fffffff7fff7e80};
11185
11186 // Encodable with `sqsub` (shift 0).
Jacob Bramleyb28f6172019-10-02 12:12:35 +010011187 // Note that encodable immediates are unsigned, even for signed saturation.
11188 IntWideImmHelper(config, fn, kBRegSize, in_b, 129, exp_b_1);
TatWai Chong6995bfd2019-09-26 10:48:05 +010011189 IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
11190 IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
Jacob Bramleyb28f6172019-10-02 12:12:35 +010011191 IntWideImmHelper(config, fn, kDRegSize, in_d, 255, exp_d_1);
TatWai Chong6995bfd2019-09-26 10:48:05 +010011192
11193 unsigned exp_h_2[] = {0x8000, 0x6f7f, 0x0010, 0x9aaa};
11194 unsigned exp_s_2[] = {0x80010281, 0x7fff007f, 0xaaaa2baa, 0xf00071f0};
11195 uint64_t exp_d_2[] = {0x8000000180008281, 0x7fffffff7ffe807f};
11196
11197 // Encodable with `sqsub` (shift 8).
11198 // B-sized lanes cannot take a shift of 8.
11199 IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
11200 IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
11201 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
TatWai Chong6995bfd2019-09-26 10:48:05 +010011202}
11203
11204TEST_SVE(sve_int_wide_imm_unpredicated_uqsub) {
11205 unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
11206 unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
11207 unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
11208 uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};
11209
11210 IntWideImmFn fn = &MacroAssembler::Uqsub;
11211
11212 unsigned exp_b_1[] = {0x00, 0x00, 0x00, 0x7e};
11213 unsigned exp_h_1[] = {0x8171, 0x7f6f, 0x1000, 0xaa9a};
11214 unsigned exp_s_1[] = {0x80018102, 0x7fff7f00, 0xaaaaaa2b, 0xf000f071};
11215 uint64_t exp_d_1[] = {0x8000000180018082, 0x7fffffff7fff7e80};
11216
11217 // Encodable with `uqsub` (shift 0).
11218 IntWideImmHelper(config, fn, kBRegSize, in_b, 0x81, exp_b_1);
11219 IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
11220 IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
11221 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff, exp_d_1);
11222
11223 unsigned exp_h_2[] = {0x7181, 0x6f7f, 0x0010, 0x9aaa};
11224 unsigned exp_s_2[] = {0x80010281, 0x7fff007f, 0xaaaa2baa, 0xf00071f0};
11225 uint64_t exp_d_2[] = {0x8000000180008281, 0x7fffffff7ffe807f};
11226
11227 // Encodable with `uqsub` (shift 8).
11228 // B-sized lanes cannot take a shift of 8.
11229 IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
11230 IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
11231 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
TatWai Chong6995bfd2019-09-26 10:48:05 +010011232}
11233
11234TEST_SVE(sve_int_wide_imm_unpredicated_subr) {
11235 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
11236 START();
11237
11238 // Encodable with `subr` (shift 0).
11239 __ Index(z0.VnD(), 1, 1);
11240 __ Sub(z0.VnD(), 100, z0.VnD());
11241 __ Index(z1.VnS(), 0x7f, 1);
11242 __ Sub(z1.VnS(), 0xf7, z1.VnS());
11243 __ Index(z2.VnH(), 0xaaaa, 0x2222);
11244 __ Sub(z2.VnH(), 0x80, z2.VnH());
11245 __ Index(z3.VnB(), 133, 1);
11246 __ Sub(z3.VnB(), 255, z3.VnB());
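  // Reversed subtraction computes (immediate - lane). Working one case
  // through by hand: z0 starts as 1, 2, 3, ... per D-sized lane, so
  // 100 - lane gives 99, 98, 97, ..., which is what expected_z0 checks below
  // (lane 0 corresponds to the last element of the array).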
11247
11248 // Encodable with `subr` (shift 8).
11249 __ Index(z4.VnD(), 256, -1);
11250 __ Sub(z4.VnD(), 42 * 256, z4.VnD());
11251 __ Index(z5.VnS(), 0x7878, 1);
11252 __ Sub(z5.VnS(), 0x8000, z5.VnS());
11253 __ Index(z6.VnH(), 0x30f0, -1);
11254 __ Sub(z6.VnH(), 0x7f00, z6.VnH());
11255 // B-sized lanes cannot take a shift of 8.
11256
11257 // Select with movprfx.
11258 __ Index(z31.VnD(), 256, 4001);
11259 __ Sub(z7.VnD(), 42 * 256, z31.VnD());
11260
11261 // The immediate is out of the encodable range of `subr`.
11262 __ Index(z30.VnS(), 0x11223344, 1);
11263 __ Sub(z8.VnS(), 0x88776655, z30.VnS());
11264
11265 END();
11266
11267 if (CAN_RUN()) {
11268 RUN();
11269
11270 int expected_z0[] = {87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99};
11271 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
11272
11273 int expected_z1[] = {0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78};
11274 ASSERT_EQUAL_SVE(expected_z1, z1.VnS());
11275
11276 int expected_z2[] = {0xab2c, 0xcd4e, 0xef70, 0x1192, 0x33b4, 0x55d6};
11277 ASSERT_EQUAL_SVE(expected_z2, z2.VnH());
11278
11279 int expected_z3[] = {0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a};
11280 ASSERT_EQUAL_SVE(expected_z3, z3.VnB());
11281
11282 int expected_z4[] = {10502, 10501, 10500, 10499, 10498, 10497, 10496};
11283 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
11284
11285 int expected_z5[] = {0x0783, 0x0784, 0x0785, 0x0786, 0x0787, 0x0788};
11286 ASSERT_EQUAL_SVE(expected_z5, z5.VnS());
11287
11288 int expected_z6[] = {0x4e15, 0x4e14, 0x4e13, 0x4e12, 0x4e11, 0x4e10};
11289 ASSERT_EQUAL_SVE(expected_z6, z6.VnH());
11290
11291 int expected_z7[] = {-13510, -9509, -5508, -1507, 2494, 6495, 10496};
11292 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
11293
11294 int expected_z8[] = {0x7755330e, 0x7755330f, 0x77553310, 0x77553311};
11295 ASSERT_EQUAL_SVE(expected_z8, z8.VnS());
11296 }
11297}
11298
11299TEST_SVE(sve_int_wide_imm_unpredicated_fdup) {
11300 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
11301 START();
11302
11303 // Immediates which can be encoded in the instructions.
11304 __ Fdup(z0.VnH(), RawbitsToFloat16(0xc500));
11305 __ Fdup(z1.VnS(), Float16(2.0));
11306 __ Fdup(z2.VnD(), Float16(3.875));
11307 __ Fdup(z3.VnH(), 8.0f);
11308 __ Fdup(z4.VnS(), -4.75f);
11309 __ Fdup(z5.VnD(), 0.5f);
11310 __ Fdup(z6.VnH(), 1.0);
11311 __ Fdup(z7.VnS(), 2.125);
11312 __ Fdup(z8.VnD(), -13.0);
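  // The encodable set is the usual AArch64 8-bit FP immediate: values of the
  // form +/-(1 + m/16) * 2^e with m in [0, 15] and e in [-3, 4], i.e. from
  // +/-0.125 up to +/-31.0 in limited steps. That covers all of the values
  // above, but not 0.0, the infinities, 255.0 or 12.3456 below.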
11313
11314 // Immediates which cannot be encoded in the instructions.
11315 __ Fdup(z10.VnH(), Float16(0.0));
11316 __ Fdup(z11.VnH(), kFP16PositiveInfinity);
11317 __ Fdup(z12.VnS(), 255.0f);
11318 __ Fdup(z13.VnS(), kFP32NegativeInfinity);
11319 __ Fdup(z14.VnD(), 12.3456);
11320 __ Fdup(z15.VnD(), kFP64PositiveInfinity);
11321
11322 END();
11323
11324 if (CAN_RUN()) {
11325 RUN();
11326
11327 ASSERT_EQUAL_SVE(0xc500, z0.VnH());
11328 ASSERT_EQUAL_SVE(0x40000000, z1.VnS());
11329 ASSERT_EQUAL_SVE(0x400f000000000000, z2.VnD());
11330 ASSERT_EQUAL_SVE(0x4800, z3.VnH());
11331 ASSERT_EQUAL_SVE(FloatToRawbits(-4.75f), z4.VnS());
11332 ASSERT_EQUAL_SVE(DoubleToRawbits(0.5), z5.VnD());
11333 ASSERT_EQUAL_SVE(0x3c00, z6.VnH());
11334 ASSERT_EQUAL_SVE(FloatToRawbits(2.125f), z7.VnS());
11335 ASSERT_EQUAL_SVE(DoubleToRawbits(-13.0), z8.VnD());
11336
11337 ASSERT_EQUAL_SVE(0x0000, z10.VnH());
11338 ASSERT_EQUAL_SVE(Float16ToRawbits(kFP16PositiveInfinity), z11.VnH());
11339 ASSERT_EQUAL_SVE(FloatToRawbits(255.0), z12.VnS());
11340 ASSERT_EQUAL_SVE(FloatToRawbits(kFP32NegativeInfinity), z13.VnS());
11341 ASSERT_EQUAL_SVE(DoubleToRawbits(12.3456), z14.VnD());
11342 ASSERT_EQUAL_SVE(DoubleToRawbits(kFP64PositiveInfinity), z15.VnD());
11343 }
11344}
11345
TatWai Chong6f111bc2019-10-07 09:20:37 +010011346TEST_SVE(sve_andv_eorv_orv) {
11347 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
11348 START();
11349
11350 uint64_t in[] = {0x8899aabbccddeeff, 0x7777555533331111, 0x123456789abcdef0};
11351 InsrHelper(&masm, z31.VnD(), in);
11352
11353 // For simplicity, we re-use the same pg for various lane sizes.
11354 // For D lanes: 1, 1, 0
11355 // For S lanes: 1, 1, 1, 0, 0
11356 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
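  // (The mapping works because predicates hold one bit per byte, and a wider
  // lane counts as active when the bit for its least-significant byte is set;
  // the D, S and H patterns above are just pg_in sampled at every 8th, 4th
  // and 2nd position.)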
11357 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
11358 Initialise(&masm, p0.VnB(), pg_in);
11359
11360 // Make a copy so we can check that constructive operations preserve zn.
11361 __ Mov(z0, z31);
11362 __ Andv(b0, p0, z0.VnB()); // destructive
11363 __ Andv(h1, p0, z31.VnH());
11364 __ Mov(z2, z31);
11365 __ Andv(s2, p0, z2.VnS()); // destructive
11366 __ Andv(d3, p0, z31.VnD());
11367
11368 __ Eorv(b4, p0, z31.VnB());
11369 __ Mov(z5, z31);
11370 __ Eorv(h5, p0, z5.VnH()); // destructive
11371 __ Eorv(s6, p0, z31.VnS());
11372 __ Mov(z7, z31);
11373 __ Eorv(d7, p0, z7.VnD()); // destructive
11374
11375 __ Mov(z8, z31);
11376 __ Orv(b8, p0, z8.VnB()); // destructive
11377 __ Orv(h9, p0, z31.VnH());
11378 __ Mov(z10, z31);
11379 __ Orv(s10, p0, z10.VnS()); // destructive
11380 __ Orv(d11, p0, z31.VnD());
11381
11382 END();
11383
11384 if (CAN_RUN()) {
11385 RUN();
11386
11387 if (static_cast<int>(ArrayLength(pg_in)) >= config->sve_vl_in_bytes()) {
11388 ASSERT_EQUAL_64(0x10, d0);
11389 ASSERT_EQUAL_64(0x1010, d1);
11390 ASSERT_EQUAL_64(0x33331111, d2);
11391 ASSERT_EQUAL_64(0x7777555533331111, d3);
11392 ASSERT_EQUAL_64(0xbf, d4);
11393 ASSERT_EQUAL_64(0xedcb, d5);
11394 ASSERT_EQUAL_64(0x44444444, d6);
11395 ASSERT_EQUAL_64(0x7777555533331111, d7);
11396 ASSERT_EQUAL_64(0xff, d8);
11397 ASSERT_EQUAL_64(0xffff, d9);
11398 ASSERT_EQUAL_64(0x77775555, d10);
11399 ASSERT_EQUAL_64(0x7777555533331111, d11);
11400 } else {
11401 ASSERT_EQUAL_64(0, d0);
11402 ASSERT_EQUAL_64(0x0010, d1);
11403 ASSERT_EQUAL_64(0x00110011, d2);
11404 ASSERT_EQUAL_64(0x0011001100110011, d3);
11405 ASSERT_EQUAL_64(0x62, d4);
11406 ASSERT_EQUAL_64(0x0334, d5);
11407 ASSERT_EQUAL_64(0x8899aabb, d6);
11408 ASSERT_EQUAL_64(0xffeeffeeffeeffee, d7);
11409 ASSERT_EQUAL_64(0xff, d8);
11410 ASSERT_EQUAL_64(0xffff, d9);
11411 ASSERT_EQUAL_64(0xffffffff, d10);
11412 ASSERT_EQUAL_64(0xffffffffffffffff, d11);
11413 }
11414
11415 // Check the upper lanes above the top of the V register are all clear.
11416 for (int i = 1; i < core.GetSVELaneCount(kDRegSize); i++) {
11417 ASSERT_EQUAL_SVE_LANE(0, z0.VnD(), i);
11418 ASSERT_EQUAL_SVE_LANE(0, z1.VnD(), i);
11419 ASSERT_EQUAL_SVE_LANE(0, z2.VnD(), i);
11420 ASSERT_EQUAL_SVE_LANE(0, z3.VnD(), i);
11421 ASSERT_EQUAL_SVE_LANE(0, z4.VnD(), i);
11422 ASSERT_EQUAL_SVE_LANE(0, z5.VnD(), i);
11423 ASSERT_EQUAL_SVE_LANE(0, z6.VnD(), i);
11424 ASSERT_EQUAL_SVE_LANE(0, z7.VnD(), i);
11425 ASSERT_EQUAL_SVE_LANE(0, z8.VnD(), i);
11426 ASSERT_EQUAL_SVE_LANE(0, z9.VnD(), i);
11427 ASSERT_EQUAL_SVE_LANE(0, z10.VnD(), i);
11428 ASSERT_EQUAL_SVE_LANE(0, z11.VnD(), i);
11429 }
11430 }
11431}
11432
TatWai Chongb2d8d1f2019-10-21 15:19:31 -070011433
11434TEST_SVE(sve_saddv_uaddv) {
11435 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
11436 START();
11437
11438 uint64_t in[] = {0x8899aabbccddeeff, 0x8182838485868788, 0x0807060504030201};
11439 InsrHelper(&masm, z31.VnD(), in);
11440
11441 // For simplicity, we re-use the same pg for various lane sizes.
11442 // For D lanes: 1, 1, 0
11443 // For S lanes: 1, 1, 1, 0, 0
11444 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
11445 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
11446 Initialise(&masm, p0.VnB(), pg_in);
11447
11448 // Make a copy so we can check that constructive operations preserve zn.
11449 __ Mov(z0, z31);
11450 __ Saddv(b0, p0, z0.VnB()); // destructive
11451 __ Saddv(h1, p0, z31.VnH());
11452 __ Mov(z2, z31);
11453 __ Saddv(s2, p0, z2.VnS()); // destructive
11454
11455 __ Uaddv(b4, p0, z31.VnB());
11456 __ Mov(z5, z31);
11457 __ Uaddv(h5, p0, z5.VnH()); // destructive
11458 __ Uaddv(s6, p0, z31.VnS());
11459 __ Mov(z7, z31);
11460 __ Uaddv(d7, p0, z7.VnD()); // destructive
11461
11462 END();
11463
11464 if (CAN_RUN()) {
11465 RUN();
11466
11467 if (static_cast<int>(ArrayLength(pg_in)) >= config->sve_vl_in_bytes()) {
11468 // Saddv
11469 ASSERT_EQUAL_64(0xfffffffffffffda9, d0);
11470 ASSERT_EQUAL_64(0xfffffffffffe9495, d1);
11471 ASSERT_EQUAL_64(0xffffffff07090b0c, d2);
11472 // Uaddv
11473 ASSERT_EQUAL_64(0x00000000000002a9, d4);
11474 ASSERT_EQUAL_64(0x0000000000019495, d5);
11475 ASSERT_EQUAL_64(0x0000000107090b0c, d6);
11476 ASSERT_EQUAL_64(0x8182838485868788, d7);
11477 } else {
11478 // Saddv
11479 ASSERT_EQUAL_64(0xfffffffffffffd62, d0);
11480 ASSERT_EQUAL_64(0xfffffffffffe8394, d1);
11481 ASSERT_EQUAL_64(0xfffffffed3e6fa0b, d2);
11482 // Uaddv
11483 ASSERT_EQUAL_64(0x0000000000000562, d4);
11484 ASSERT_EQUAL_64(0x0000000000028394, d5);
11485 ASSERT_EQUAL_64(0x00000001d3e6fa0b, d6);
11486 ASSERT_EQUAL_64(0x0a1c2e4052647687, d7);
11487 }
11488
11489 // Check that the lanes above the top of the V register are all clear.
11490 for (int i = 1; i < core.GetSVELaneCount(kDRegSize); i++) {
11491 ASSERT_EQUAL_SVE_LANE(0, z0.VnD(), i);
11492 ASSERT_EQUAL_SVE_LANE(0, z1.VnD(), i);
11493 ASSERT_EQUAL_SVE_LANE(0, z2.VnD(), i);
11494 ASSERT_EQUAL_SVE_LANE(0, z4.VnD(), i);
11495 ASSERT_EQUAL_SVE_LANE(0, z5.VnD(), i);
11496 ASSERT_EQUAL_SVE_LANE(0, z6.VnD(), i);
11497 ASSERT_EQUAL_SVE_LANE(0, z7.VnD(), i);
11498 }
11499 }
11500}
11501
11502
11503TEST_SVE(sve_sminv_uminv) {
11504 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
11505 START();
11506
11507 uint64_t in[] = {0xfffa5555aaaaaaaa, 0x0011223344aafe80, 0x00112233aabbfc00};
11508 InsrHelper(&masm, z31.VnD(), in);
11509
11510 // For simplicity, we re-use the same pg for various lane sizes.
11511 // For D lanes: 1, 0, 1
11512 // For S lanes: 1, 1, 0, 0, 1
11513 // For H lanes: 1, 1, 0, 1, 1, 0, 0, 0, 1, 1
11514 int pg_in[] = {1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1};
11515 Initialise(&masm, p0.VnB(), pg_in);
11516
11517 // Make a copy so we can check that constructive operations preserve zn.
11518 __ Mov(z0, z31);
11519 __ Sminv(b0, p0, z0.VnB()); // destructive
11520 __ Sminv(h1, p0, z31.VnH());
11521 __ Mov(z2, z31);
11522 __ Sminv(s2, p0, z2.VnS()); // destructive
11523 __ Sminv(d3, p0, z31.VnD());
11524
11525 __ Uminv(b4, p0, z31.VnB());
11526 __ Mov(z5, z31);
11527 __ Uminv(h5, p0, z5.VnH()); // destructive
11528 __ Uminv(s6, p0, z31.VnS());
11529 __ Mov(z7, z31);
11530 __ Uminv(d7, p0, z7.VnD()); // destructive
11531
11532 END();
11533
11534 if (CAN_RUN()) {
11535 RUN();
11536
11537 if (static_cast<int>(ArrayLength(pg_in)) >= config->sve_vl_in_bytes()) {
11538 // Sminv
11539 ASSERT_EQUAL_64(0xaa, d0);
11540 ASSERT_EQUAL_64(0xaabb, d1);
11541 ASSERT_EQUAL_64(0xaabbfc00, d2);
11542 ASSERT_EQUAL_64(0x00112233aabbfc00, d3); // The smaller lane is inactive.
11543 // Uminv
11544 ASSERT_EQUAL_64(0, d4);
11545 ASSERT_EQUAL_64(0x2233, d5);
11546 ASSERT_EQUAL_64(0x112233, d6);
11547 ASSERT_EQUAL_64(0x00112233aabbfc00, d7); // The smaller lane is inactive.
11548 } else {
11549 // Sminv
11550 ASSERT_EQUAL_64(0xaa, d0);
11551 ASSERT_EQUAL_64(0xaaaa, d1);
11552 ASSERT_EQUAL_64(0xaaaaaaaa, d2);
11553 ASSERT_EQUAL_64(0xfffa5555aaaaaaaa, d3);
11554 // Uminv
11555 ASSERT_EQUAL_64(0, d4);
11556 ASSERT_EQUAL_64(0x2233, d5);
11557 ASSERT_EQUAL_64(0x112233, d6);
11558 ASSERT_EQUAL_64(0x00112233aabbfc00, d7);
11559 }
11560
11561 // Check that the lanes above the top of the V register are all clear.
11562 for (int i = 1; i < core.GetSVELaneCount(kDRegSize); i++) {
11563 ASSERT_EQUAL_SVE_LANE(0, z0.VnD(), i);
11564 ASSERT_EQUAL_SVE_LANE(0, z1.VnD(), i);
11565 ASSERT_EQUAL_SVE_LANE(0, z2.VnD(), i);
11566 ASSERT_EQUAL_SVE_LANE(0, z3.VnD(), i);
11567 ASSERT_EQUAL_SVE_LANE(0, z4.VnD(), i);
11568 ASSERT_EQUAL_SVE_LANE(0, z5.VnD(), i);
11569 ASSERT_EQUAL_SVE_LANE(0, z6.VnD(), i);
11570 ASSERT_EQUAL_SVE_LANE(0, z7.VnD(), i);
11571 }
11572 }
11573}
11574
11575TEST_SVE(sve_smaxv_umaxv) {
11576 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
11577 START();
11578
11579 uint64_t in[] = {0xfffa5555aaaaaaaa, 0x0011223344aafe80, 0x00112233aabbfc00};
11580 InsrHelper(&masm, z31.VnD(), in);
11581
11582 // For simplicity, we re-use the same pg for various lane sizes.
11583 // For D lanes: 1, 0, 1
11584 // For S lanes: 1, 1, 0, 0, 1
11585 // For H lanes: 1, 1, 0, 1, 1, 0, 0, 0, 1, 1
11586 int pg_in[] = {1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1};
11587 Initialise(&masm, p0.VnB(), pg_in);
11588
11589 // Make a copy so we can check that constructive operations preserve zn.
11590 __ Mov(z0, z31);
11591 __ Smaxv(b0, p0, z0.VnB()); // destructive
11592 __ Smaxv(h1, p0, z31.VnH());
11593 __ Mov(z2, z31);
11594 __ Smaxv(s2, p0, z2.VnS()); // destructive
11595 __ Smaxv(d3, p0, z31.VnD());
11596
11597 __ Umaxv(b4, p0, z31.VnB());
11598 __ Mov(z5, z31);
11599 __ Umaxv(h5, p0, z5.VnH()); // destructive
11600 __ Umaxv(s6, p0, z31.VnS());
11601 __ Mov(z7, z31);
11602 __ Umaxv(d7, p0, z7.VnD()); // destructive
11603
11604 END();
11605
11606 if (CAN_RUN()) {
11607 RUN();
11608
11609 if (static_cast<int>(ArrayLength(pg_in)) >= config->sve_vl_in_bytes()) {
11610 // Smaxv
11611 ASSERT_EQUAL_64(0x33, d0);
11612 ASSERT_EQUAL_64(0x44aa, d1);
11613 ASSERT_EQUAL_64(0x112233, d2);
11614 ASSERT_EQUAL_64(0x112233aabbfc00, d3);
11615 // Umaxv
11616 ASSERT_EQUAL_64(0xfe, d4);
11617 ASSERT_EQUAL_64(0xfc00, d5);
11618 ASSERT_EQUAL_64(0xaabbfc00, d6);
11619 ASSERT_EQUAL_64(0x112233aabbfc00, d7);
11620 } else {
11621 // Smaxv
11622 ASSERT_EQUAL_64(0x33, d0);
11623 ASSERT_EQUAL_64(0x44aa, d1);
11624 ASSERT_EQUAL_64(0x112233, d2);
11625 ASSERT_EQUAL_64(0x00112233aabbfc00, d3);
11626 // Umaxv
11627 ASSERT_EQUAL_64(0xfe, d4);
11628 ASSERT_EQUAL_64(0xfc00, d5);
11629 ASSERT_EQUAL_64(0xaabbfc00, d6);
11630 ASSERT_EQUAL_64(0xfffa5555aaaaaaaa, d7);
11631 }
11632
11633 // Check that the lanes above the top of the V register are all clear.
11634 for (int i = 1; i < core.GetSVELaneCount(kDRegSize); i++) {
11635 ASSERT_EQUAL_SVE_LANE(0, z0.VnD(), i);
11636 ASSERT_EQUAL_SVE_LANE(0, z1.VnD(), i);
11637 ASSERT_EQUAL_SVE_LANE(0, z2.VnD(), i);
11638 ASSERT_EQUAL_SVE_LANE(0, z3.VnD(), i);
11639 ASSERT_EQUAL_SVE_LANE(0, z4.VnD(), i);
11640 ASSERT_EQUAL_SVE_LANE(0, z5.VnD(), i);
11641 ASSERT_EQUAL_SVE_LANE(0, z6.VnD(), i);
11642 ASSERT_EQUAL_SVE_LANE(0, z7.VnD(), i);
11643 }
11644 }
11645}
11646
11647template <typename T, size_t M, size_t N>
11648static void SdotUdotHelper(Test* config,
11649 unsigned lane_size_in_bits,
11650 const T (&zd_inputs)[M],
11651 const T (&za_inputs)[M],
11652 const T (&zn_inputs)[N],
11653 const T (&zm_inputs)[N],
11654 const T (&zd_expected)[M],
11655 const T (&zdnm_expected)[M],
11656 bool is_signed,
11657 int index = -1) {
11658 VIXL_STATIC_ASSERT(N == (M * 4));
11659 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
11660 START();
11661
11662 auto dot_fn = [&](const ZRegister& zd,
11663 const ZRegister& za,
11664 const ZRegister& zn,
11665 const ZRegister& zm,
11666 bool is_signed,
11667 int index) {
11668 if (is_signed) {
11669 if (index < 0) {
11670 __ Sdot(zd, za, zn, zm);
11671 } else {
11672 __ Sdot(zd, za, zn, zm, index);
11673 }
11674 } else {
11675 if (index < 0) {
11676 __ Udot(zd, za, zn, zm);
11677 } else {
11678 __ Udot(zd, za, zn, zm, index);
11679 }
11680 }
11681 };
11682
11683 ZRegister zd = z0.WithLaneSize(lane_size_in_bits);
11684 ZRegister za = z1.WithLaneSize(lane_size_in_bits);
11685 ZRegister zn = z2.WithLaneSize(lane_size_in_bits / 4);
11686 ZRegister zm = z3.WithLaneSize(lane_size_in_bits / 4);
11687
11688 InsrHelper(&masm, zd, zd_inputs);
11689 InsrHelper(&masm, za, za_inputs);
11690 InsrHelper(&masm, zn, zn_inputs);
11691 InsrHelper(&masm, zm, zm_inputs);
11692
11693 // The Dot macro handles arbitrarily-aliased registers in the argument list.
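// The result registers below cover the register-aliasing cases: zd aliasing
// za (da_result), zn (dn_result), zm (dm_result), both zn and zm
// (dnm_result), and none of the sources (d_result).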
11694 ZRegister dm_result = z4.WithLaneSize(lane_size_in_bits);
11695 ZRegister dnm_result = z5.WithLaneSize(lane_size_in_bits);
11696 ZRegister da_result = z6.WithLaneSize(lane_size_in_bits);
11697 ZRegister dn_result = z7.WithLaneSize(lane_size_in_bits);
11698 ZRegister d_result = z8.WithLaneSize(lane_size_in_bits);
11699
11700 __ Mov(da_result, za);
11701 // zda = zda + (zn . zm)
11702 dot_fn(da_result, da_result, zn, zm, is_signed, index);
11703
11704 __ Mov(dn_result, zn.WithSameLaneSizeAs(dn_result));
11705 // zdn = za + (zdn . zm)
11706 dot_fn(dn_result, za, dn_result.WithSameLaneSizeAs(zn), zm, is_signed, index);
11707
11708 __ Mov(dm_result, zm.WithSameLaneSizeAs(dm_result));
11709 // zdm = za + (zn . zdm)
11710 dot_fn(dm_result, za, zn, dm_result.WithSameLaneSizeAs(zm), is_signed, index);
11711
11712 __ Mov(d_result, zd);
11713 // zd = za + (zn . zm)
11714 dot_fn(d_result, za, zn, zm, is_signed, index);
11715
11716 __ Mov(dnm_result, zn.WithSameLaneSizeAs(dnm_result));
11717 // zdnm = za + (zdnm . zdnm)
11718 dot_fn(dnm_result,
11719 za,
11720 dnm_result.WithSameLaneSizeAs(zn),
11721 dnm_result.WithSameLaneSizeAs(zm),
11722 is_signed,
11723 index);
11724
11725 END();
11726
11727 if (CAN_RUN()) {
11728 RUN();
11729
11730 ASSERT_EQUAL_SVE(za_inputs, z1.WithLaneSize(lane_size_in_bits));
11731 ASSERT_EQUAL_SVE(zn_inputs, z2.WithLaneSize(lane_size_in_bits / 4));
11732 ASSERT_EQUAL_SVE(zm_inputs, z3.WithLaneSize(lane_size_in_bits / 4));
11733
11734 ASSERT_EQUAL_SVE(zd_expected, da_result);
11735 ASSERT_EQUAL_SVE(zd_expected, dn_result);
11736 ASSERT_EQUAL_SVE(zd_expected, dm_result);
11737 ASSERT_EQUAL_SVE(zd_expected, d_result);
11738
11739 ASSERT_EQUAL_SVE(zdnm_expected, dnm_result);
11740 }
11741}
11742
11743TEST_SVE(sve_sdot) {
11744 int64_t zd_inputs[] = {0x33, 0xee, 0xff};
11745 int64_t za_inputs[] = {INT32_MAX, -3, 2};
11746 int64_t zn_inputs[] = {-128, -128, -128, -128, 9, -1, 1, 30, -5, -20, 9, 8};
11747 int64_t zm_inputs[] = {-128, -128, -128, -128, -19, 15, 6, 0, 9, -5, 4, 5};
11748
11749 // zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
11750 int64_t zd_expected_s[] = {-2147418113, -183, 133}; // 0x8000ffff
11751 int64_t zd_expected_d[] = {2147549183, -183, 133}; // 0x8000ffff
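// For example, the first lane: INT32_MAX + 4 * (-128 * -128)
//   = 0x7fffffff + 0x10000 = 0x8000ffff, which reads back as -2147418113
//   from an S lane and as 2147549183 from a D lane.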
11752
11753 // zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
11754 int64_t zdnm_expected_s[] = {-2147418113, 980, 572};
11755 int64_t zdnm_expected_d[] = {2147549183, 980, 572};
11756
11757 SdotUdotHelper(config,
11758 kSRegSize,
11759 zd_inputs,
11760 za_inputs,
11761 zn_inputs,
11762 zm_inputs,
11763 zd_expected_s,
11764 zdnm_expected_s,
11765 true);
11766
11767 SdotUdotHelper(config,
11768 kDRegSize,
11769 zd_inputs,
11770 za_inputs,
11771 zn_inputs,
11772 zm_inputs,
11773 zd_expected_d,
11774 zdnm_expected_d,
11775 true);
11776}
11777
11778TEST_SVE(sve_udot) {
11779 int64_t zd_inputs[] = {0x33, 0xee, 0xff};
11780 int64_t za_inputs[] = {INT32_MAX, -3, 2};
11781 int64_t zn_inputs[] = {-128, -128, -128, -128, 9, -1, 1, 30, -5, -20, 9, 8};
11782 int64_t zm_inputs[] = {-128, -128, -128, -128, -19, 15, 6, 0, 9, -5, 4, 5};
11783
11784 // zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
11785 int64_t zd_expected_s[] = {0x8000ffff, 0x00001749, 0x0000f085};
11786 int64_t zd_expected_d[] = {0x000000047c00ffff,
11787 0x000000000017ff49,
11788 0x00000000fff00085};
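// For example, the first D lane treats -128 as the unsigned halfword 0xff80:
//   0x7fffffff + 4 * (0xff80 * 0xff80) = 0x7fffffff + 0x3fc010000
//   = 0x47c00ffff.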
11789
11790 // zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
11791 int64_t zdnm_expected_s[] = {0x8000ffff, 0x000101d4, 0x0001d03c};
11792 int64_t zdnm_expected_d[] = {0x000000047c00ffff,
11793 0x00000000fffe03d4,
11794 0x00000001ffce023c};
11795
11796 SdotUdotHelper(config,
11797 kSRegSize,
11798 zd_inputs,
11799 za_inputs,
11800 zn_inputs,
11801 zm_inputs,
11802 zd_expected_s,
11803 zdnm_expected_s,
11804 false);
11805
11806 SdotUdotHelper(config,
11807 kDRegSize,
11808 zd_inputs,
11809 za_inputs,
11810 zn_inputs,
11811 zm_inputs,
11812 zd_expected_d,
11813 zdnm_expected_d,
11814 false);
11815}
11816
11817TEST_SVE(sve_sdot_indexed_s) {
11818 int64_t zd_inputs[] = {0xff, 0xff, 0xff, 0xff};
11819 int64_t za_inputs[] = {0, 1, 2, 3};
11820 int64_t zn_inputs[] =
11821 {-1, -1, -1, -1, -2, -2, -2, -2, -3, -3, -3, -3, -4, -4, -4, -4};
11822 int64_t zm_inputs[] =
11823 {127, 127, 127, 127, -128, -128, -128, -128, -1, -1, -1, -1, 0, 0, 0, 0};
11824
11825 constexpr int s = kQRegSize / kSRegSize;
11826
11827 // zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
11828 int64_t zd_expected_s[][s] = {{0, 1, 2, 3}, // Generated from zm[0]
11829 {4, 9, 14, 19},
11830 {512, 1025, 1538, 2051},
11831 {-508, -1015, -1522, -2029}};
11832
11833 // zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
11834 int64_t zdnm_expected_s[][s] = {{16, 33, 50, 67},
11835 {12, 25, 38, 51},
11836 {8, 17, 26, 35},
11837 {4, 9, 14, 19}};
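// With an explicit index, Sdot multiplies each group of four zn elements by
// the group of four zm elements selected by `index` within the same 128-bit
// segment, so each row above corresponds to one index value.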
11838
11839 for (unsigned i = 0; i < s; i++) {
11840 SdotUdotHelper(config,
11841 kSRegSize,
11842 zd_inputs,
11843 za_inputs,
11844 zn_inputs,
11845 zm_inputs,
11846 zd_expected_s[i],
11847 zdnm_expected_s[i],
11848 true,
11849 i);
11850 }
11851}
11852
11853TEST_SVE(sve_sdot_indexed_d) {
11854 int64_t zd_inputs[] = {0xff, 0xff};
11855 int64_t za_inputs[] = {0, 1};
11856 int64_t zn_inputs[] = {-1, -1, -1, -1, -1, -1, -1, -1};
11857 int64_t zm_inputs[] = {-128, -128, -128, -128, 127, 127, 127, 127};
11858
11859 constexpr int d = kQRegSize / kDRegSize;
11860
11861 // zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
11862 int64_t zd_expected_d[][d] = {{-508, -507}, // Generated from zm[0]
11863 {512, 513}};
11864
11865 // zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
11866 int64_t zdnm_expected_d[][d] = {{4, 5}, {4, 5}};
11867
11868 for (unsigned i = 0; i < d; i++) {
11869 SdotUdotHelper(config,
11870 kDRegSize,
11871 zd_inputs,
11872 za_inputs,
11873 zn_inputs,
11874 zm_inputs,
11875 zd_expected_d[i],
11876 zdnm_expected_d[i],
11877 true,
11878 i);
11879 }
11880}
11881
11882TEST_SVE(sve_udot_indexed_s) {
11883 int64_t zd_inputs[] = {0xff, 0xff, 0xff, 0xff};
11884 int64_t za_inputs[] = {0, 1, 2, 3};
11885 int64_t zn_inputs[] = {1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4};
11886 int64_t zm_inputs[] =
11887 {127, 127, 127, 127, 255, 255, 255, 255, 1, 1, 1, 1, 0, 0, 0, 0};
11888
11889 constexpr int s = kQRegSize / kSRegSize;
11890
11891 // zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
11892 int64_t zd_expected_s[][s] = {{0, 1, 2, 3},
11893 {4, 9, 14, 19},
11894 {1020, 2041, 3062, 4083},
11895 {508, 1017, 1526, 2035}};
11896
11897 // zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
11898 int64_t zdnm_expected_s[][s] = {{16, 33, 50, 67},
11899 {12, 25, 38, 51},
11900 {8, 17, 26, 35},
11901 {4, 9, 14, 19}};
11902
11903 for (unsigned i = 0; i < s; i++) {
11904 SdotUdotHelper(config,
11905 kSRegSize,
11906 zd_inputs,
11907 za_inputs,
11908 zn_inputs,
11909 zm_inputs,
11910 zd_expected_s[i],
11911 zdnm_expected_s[i],
11912 false,
11913 i);
11914 }
11915}
11916
11917TEST_SVE(sve_udot_indexed_d) {
11918 int64_t zd_inputs[] = {0xff, 0xff};
11919 int64_t za_inputs[] = {0, 1};
11920 int64_t zn_inputs[] = {1, 1, 1, 1, 1, 1, 1, 1};
11921 int64_t zm_inputs[] = {255, 255, 255, 255, 127, 127, 127, 127};
11922
11923 constexpr int d = kQRegSize / kDRegSize;
11924
11925 // zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
11926 int64_t zd_expected_d[][d] = {{508, 509}, {1020, 1021}};
11927
11928 // zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
11929 int64_t zdnm_expected_d[][d] = {{4, 5}, {4, 5}};
11930
11931 for (unsigned i = 0; i < d; i++) {
11932 SdotUdotHelper(config,
11933 kDRegSize,
11934 zd_inputs,
11935 za_inputs,
11936 zn_inputs,
11937 zm_inputs,
11938 zd_expected_d[i],
11939 zdnm_expected_d[i],
11940 false,
11941 i);
11942 }
11943}
11944
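// Fill `dst` with `src` plus the index of the 128-bit segment each lane
// belongs to, so that every segment carries a slightly different pattern.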
11945static void IntSegmentPatternHelper(MacroAssembler* masm,
11946 const ZRegister& dst,
11947 const ZRegister& src) {
11948 VIXL_ASSERT(AreSameLaneSize(dst, src));
11949 UseScratchRegisterScope temps(masm);
11950 ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(dst);
11951 masm->Index(ztmp, 0, 1);
11952 masm->Asr(ztmp, ztmp, kQRegSizeInBytesLog2 - dst.GetLaneSizeInBytesLog2());
11953 masm->Add(dst, src, ztmp);
11954}
11955
11956TEST_SVE(sve_sdot_udot_indexed_s) {
11957 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
11958 START();
11959
11960 const int multiplier = 2;
11961 __ Dup(z9.VnS(), multiplier);
11962
11963 __ Ptrue(p0.VnB());
11964 __ Index(z29.VnS(), 4, 1);
11965
11966 // z29 = [... 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0]
11967 __ And(z29.VnS(), z29.VnS(), 3);
11968
11969 // p7 = [... 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1]
11970 __ Cmple(p7.VnS(), p0.Zeroing(), z29.VnS(), 0);
11971
11972 // p6 = [... 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
11973 __ Cmple(p6.VnS(), p0.Zeroing(), z29.VnS(), 1);
11974
11975 // p5 = [... 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1]
11976 __ Cmple(p5.VnS(), p0.Zeroing(), z29.VnS(), 2);
11977
11978 __ Index(z28.VnB(), 1, 1);
11979 __ Dup(z27.VnS(), z28.VnS(), 0);
11980
11981 // z27 = [... 3, 2, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1]
11982 IntSegmentPatternHelper(&masm, z27.VnB(), z27.VnB());
11983
11984 // z27 = [... 6, 4, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1, 8, 6, 4, 2]
11985 __ Mul(z27.VnS(), p7.Merging(), z27.VnS(), z9.VnS());
11986
11987 // z27 = [... 12, 8, 4, 3, 2, 1, 4, 3, 2, 1, 8, 6, 4, 2, 16, 12, 8, 4]
11988 __ Mul(z27.VnS(), p6.Merging(), z27.VnS(), z9.VnS());
11989
11990 // 2nd segment | 1st segment |
11991 // v v
11992 // z27 = [... 24, 16, 4, 3, 2, 1, 8, 6, 4, 2, 16, 12, 8, 4, 32, 24, 16, 8]
11993 __ Mul(z27.VnS(), p5.Merging(), z27.VnS(), z9.VnS());
11994
11995 __ Dup(z0.VnS(), 0);
11996 __ Dup(z1.VnS(), 0);
11997 __ Dup(z2.VnS(), 0);
11998 __ Dup(z3.VnS(), 0);
11999 __ Dup(z4.VnS(), 0);
12000 __ Dup(z5.VnS(), 0);
12001
12002 // Skip the lanes from the 129th onwards, since their byte values overflow
12003 // in the number sequence created by `index`.
12004 __ Cmpls(p3.VnB(), p0.Zeroing(), z28.VnB(), 128);
12005 __ Mov(z0.VnB(), p3.Merging(), z27.VnB());
12006 __ Mov(z1.VnB(), p3.Merging(), z28.VnB());
12007
12008 __ Dup(z2.VnS(), 0);
12009 __ Dup(z3.VnS(), 0);
12010 __ Dup(z4.VnS(), 0);
12011 __ Dup(z5.VnS(), 0);
12012
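// Within each segment, the zm quadruplet selected by index n is half the
// size of the one selected by index n-1, so scaling the index-n result by
// 2^n should reproduce the index-0 result.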
12013 __ Udot(z2.VnS(), z2.VnS(), z1.VnB(), z0.VnB(), 0);
12014
12015 __ Udot(z3.VnS(), z3.VnS(), z1.VnB(), z0.VnB(), 1);
12016 __ Mul(z3.VnS(), z3.VnS(), 2);
12017
12018 __ Udot(z4.VnS(), z4.VnS(), z1.VnB(), z0.VnB(), 2);
12019 __ Mul(z4.VnS(), z4.VnS(), 4);
12020
12021 __ Udot(z5.VnS(), z5.VnS(), z1.VnB(), z0.VnB(), 3);
12022 __ Mul(z5.VnS(), z5.VnS(), 8);
12023
12024 __ Dup(z7.VnS(), 0);
12025 __ Dup(z8.VnS(), 0);
12026 __ Dup(z9.VnS(), 0);
12027 __ Dup(z10.VnS(), 0);
12028
12029 // Negate the all-positive vector to test the signed dot product.
12030 __ Neg(z6.VnB(), p0.Merging(), z0.VnB());
12031 __ Sdot(z7.VnS(), z7.VnS(), z1.VnB(), z6.VnB(), 0);
12032
12033 __ Sdot(z8.VnS(), z8.VnS(), z1.VnB(), z6.VnB(), 1);
12034 __ Mul(z8.VnS(), z8.VnS(), 2);
12035
12036 __ Sdot(z9.VnS(), z9.VnS(), z1.VnB(), z6.VnB(), 2);
12037 __ Mul(z9.VnS(), z9.VnS(), 4);
12038
12039 __ Sdot(z10.VnS(), z10.VnS(), z1.VnB(), z6.VnB(), 3);
12040 __ Mul(z10.VnS(), z10.VnS(), 8);
12041
12042 END();
12043
12044 if (CAN_RUN()) {
12045 RUN();
12046
12047 // Only compare the first 128-bit segment of the destination register; the
12048 // other generated results are used to check the remaining segments.
12049 // s_lane[0] = (1 * 8) + (2 * 16) + (3 * 24) + (4 * 32) = 240
12050 // ...
12051 // s_lane[3] = (13 * 8) + (14 * 16) + (15 * 24) + (16 * 32) = 1200
12052 int udot_expected[] = {1200, 880, 560, 240};
12053 ASSERT_EQUAL_SVE(udot_expected, z2.VnS());
12054 ASSERT_EQUAL_SVE(z2.VnS(), z3.VnS());
12055 ASSERT_EQUAL_SVE(z2.VnS(), z4.VnS());
12056 ASSERT_EQUAL_SVE(z2.VnS(), z5.VnS());
12057
12058 int sdot_expected[] = {-1200, -880, -560, -240};
12059 ASSERT_EQUAL_SVE(sdot_expected, z7.VnS());
12060 ASSERT_EQUAL_SVE(z7.VnS(), z8.VnS());
12061 ASSERT_EQUAL_SVE(z7.VnS(), z9.VnS());
12062 ASSERT_EQUAL_SVE(z7.VnS(), z10.VnS());
12063 }
12064}
12065
12066TEST_SVE(sve_sdot_udot_indexed_d) {
12067 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
12068 START();
12069
12070 const int multiplier = 2;
12071 __ Dup(z9.VnD(), multiplier);
12072
12073 __ Ptrue(p0.VnD());
12074 __ Pfalse(p1.VnD());
12075
12076 // p2 = [..., 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
12077 __ Zip1(p2.VnD(), p0.VnD(), p1.VnD());
12078
12079 __ Index(z1.VnH(), 1, 1);
12080 __ Dup(z0.VnD(), z1.VnD(), 0);
12081
12082 // z0 = [... 5, 4, 3, 2, 5, 4, 3, 2, 4, 3, 2, 1, 4, 3, 2, 1]
12083 IntSegmentPatternHelper(&masm, z0.VnH(), z0.VnH());
12084
12085 // 2nd segment | 1st segment |
12086 // v v
12087 // z0 = [... 5, 4, 3, 2, 10, 8, 6, 4, 4, 3, 2, 1, 8, 6, 4, 2]
12088 __ Mul(z0.VnD(), p2.Merging(), z0.VnD(), z9.VnD());
12089
12090 __ Dup(z3.VnD(), 0);
12091 __ Dup(z4.VnD(), 0);
12092
12093 __ Udot(z3.VnD(), z3.VnD(), z1.VnH(), z0.VnH(), 0);
12094
12095 __ Udot(z4.VnD(), z4.VnD(), z1.VnH(), z0.VnH(), 1);
12096 __ Mul(z4.VnD(), z4.VnD(), multiplier);
12097
12098 __ Dup(z12.VnD(), 0);
12099 __ Dup(z13.VnD(), 0);
12100
12101 __ Ptrue(p4.VnH());
12102 __ Neg(z10.VnH(), p4.Merging(), z0.VnH());
12103
12104 __ Sdot(z12.VnD(), z12.VnD(), z1.VnH(), z10.VnH(), 0);
12105
12106 __ Sdot(z13.VnD(), z13.VnD(), z1.VnH(), z10.VnH(), 1);
12107 __ Mul(z13.VnD(), z13.VnD(), multiplier);
12108
12109 END();
12110
12111 if (CAN_RUN()) {
12112 RUN();
12113
12114 // Only compare the first 128-bit segment of the destination register; the
12115 // other generated results are used to check the remaining segments.
12116 // d_lane[0] = (1 * 2) + (2 * 4) + (3 * 6) + (4 * 8) = 60
12117 // d_lane[1] = (5 * 2) + (6 * 4) + (7 * 6) + (8 * 8) = 140
12118 uint64_t udot_expected[] = {416, 304, 140, 60};
12119 ASSERT_EQUAL_SVE(udot_expected, z3.VnD());
12120 ASSERT_EQUAL_SVE(z3.VnD(), z4.VnD());
12121
12122 int64_t sdot_expected[] = {-416, -304, -140, -60};
12123 ASSERT_EQUAL_SVE(sdot_expected, z12.VnD());
12124 ASSERT_EQUAL_SVE(z12.VnD(), z13.VnD());
12125 }
12126}
12127
12128template <typename T, size_t N>
12129static void FPToRawbitsWithSize(const T (&inputs)[N],
12130 uint64_t* outputs,
12131 unsigned size_in_bits) {
12132 for (size_t i = 0; i < N; i++) {
12133 outputs[i] = vixl::FPToRawbitsWithSize(size_in_bits, inputs[i]);
12134 }
12135}
12136
12137template <typename Ti, typename Te, size_t N>
12138static void FPBinArithHelper(Test* config,
12139 ArithFn macro,
12140 int lane_size_in_bits,
12141 const Ti (&zn_inputs)[N],
12142 const Ti (&zm_inputs)[N],
12143 const Te (&zd_expected)[N]) {
12144 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
12145
12146 START();
12147
12148 ZRegister zd = z29.WithLaneSize(lane_size_in_bits);
12149 ZRegister zn = z30.WithLaneSize(lane_size_in_bits);
12150 ZRegister zm = z31.WithLaneSize(lane_size_in_bits);
12151
12152 uint64_t zn_rawbits[N];
12153 uint64_t zm_rawbits[N];
12154
12155 FPToRawbitsWithSize(zn_inputs, zn_rawbits, lane_size_in_bits);
12156 FPToRawbitsWithSize(zm_inputs, zm_rawbits, lane_size_in_bits);
12157
12158 InsrHelper(&masm, zn, zn_rawbits);
12159 InsrHelper(&masm, zm, zm_rawbits);
12160
12161 (masm.*macro)(zd, zn, zm);
12162
12163 END();
12164
12165 if (CAN_RUN()) {
12166 RUN();
12167
12168 ASSERT_EQUAL_SVE(zd_expected, zd);
12169 }
12170}
12171
12172TEST_SVE(sve_fp_arithmetic_unpredicated_fadd) {
12173 double zn_inputs[] = {24.0,
12174 5.5,
12175 0.0,
12176 3.875,
12177 2.125,
12178 kFP64PositiveInfinity,
12179 kFP64NegativeInfinity};
12180
12181 double zm_inputs[] = {1024.0, 2048.0, 0.1, -4.75, 12.34, 255.0, -13.0};
12182
12183 ArithFn fn = &MacroAssembler::Fadd;
12184
12185 uint16_t expected_h[] = {Float16ToRawbits(Float16(1048.0)),
12186 Float16ToRawbits(Float16(2053.5)),
12187 Float16ToRawbits(Float16(0.1)),
12188 Float16ToRawbits(Float16(-0.875)),
12189 Float16ToRawbits(Float16(14.465)),
12190 Float16ToRawbits(kFP16PositiveInfinity),
12191 Float16ToRawbits(kFP16NegativeInfinity)};
12192
12193 FPBinArithHelper(config, fn, kHRegSize, zn_inputs, zm_inputs, expected_h);
12194
12195 uint32_t expected_s[] = {FloatToRawbits(1048.0f),
12196 FloatToRawbits(2053.5f),
12197 FloatToRawbits(0.1f),
12198 FloatToRawbits(-0.875f),
12199 FloatToRawbits(14.465f),
12200 FloatToRawbits(kFP32PositiveInfinity),
12201 FloatToRawbits(kFP32NegativeInfinity)};
12202
12203 FPBinArithHelper(config, fn, kSRegSize, zn_inputs, zm_inputs, expected_s);
12204
12205 uint64_t expected_d[] = {DoubleToRawbits(1048.0),
12206 DoubleToRawbits(2053.5),
12207 DoubleToRawbits(0.1),
12208 DoubleToRawbits(-0.875),
12209 DoubleToRawbits(14.465),
12210 DoubleToRawbits(kFP64PositiveInfinity),
12211 DoubleToRawbits(kFP64NegativeInfinity)};
12212
12213 FPBinArithHelper(config, fn, kDRegSize, zn_inputs, zm_inputs, expected_d);
12214}
12215
12216TEST_SVE(sve_fp_arithmetic_unpredicated_fsub) {
12217 double zn_inputs[] = {24.0,
12218 5.5,
12219 0.0,
12220 3.875,
12221 2.125,
12222 kFP64PositiveInfinity,
12223 kFP64NegativeInfinity};
12224
12225 double zm_inputs[] = {1024.0, 2048.0, 0.1, -4.75, 12.34, 255.0, -13.0};
12226
12227 ArithFn fn = &MacroAssembler::Fsub;
12228
12229 uint16_t expected_h[] = {Float16ToRawbits(Float16(-1000.0)),
12230 Float16ToRawbits(Float16(-2042.5)),
12231 Float16ToRawbits(Float16(-0.1)),
12232 Float16ToRawbits(Float16(8.625)),
12233 Float16ToRawbits(Float16(-10.215)),
12234 Float16ToRawbits(kFP16PositiveInfinity),
12235 Float16ToRawbits(kFP16NegativeInfinity)};
12236
12237 FPBinArithHelper(config, fn, kHRegSize, zn_inputs, zm_inputs, expected_h);
12238
12239 uint32_t expected_s[] = {FloatToRawbits(-1000.0),
12240 FloatToRawbits(-2042.5),
12241 FloatToRawbits(-0.1),
12242 FloatToRawbits(8.625),
12243 FloatToRawbits(-10.215),
12244 FloatToRawbits(kFP32PositiveInfinity),
12245 FloatToRawbits(kFP32NegativeInfinity)};
12246
12247 FPBinArithHelper(config, fn, kSRegSize, zn_inputs, zm_inputs, expected_s);
12248
12249 uint64_t expected_d[] = {DoubleToRawbits(-1000.0),
12250 DoubleToRawbits(-2042.5),
12251 DoubleToRawbits(-0.1),
12252 DoubleToRawbits(8.625),
12253 DoubleToRawbits(-10.215),
12254 DoubleToRawbits(kFP64PositiveInfinity),
12255 DoubleToRawbits(kFP64NegativeInfinity)};
12256
12257 FPBinArithHelper(config, fn, kDRegSize, zn_inputs, zm_inputs, expected_d);
12258}
12259
12260TEST_SVE(sve_fp_arithmetic_unpredicated_fmul) {
12261 double zn_inputs[] = {24.0,
12262 5.5,
12263 0.0,
12264 3.875,
12265 2.125,
12266 kFP64PositiveInfinity,
12267 kFP64NegativeInfinity};
12268
12269 double zm_inputs[] = {1024.0, 2048.0, 0.1, -4.75, 12.34, 255.0, -13.0};
12270
12271 ArithFn fn = &MacroAssembler::Fmul;
12272
12273 uint16_t expected_h[] = {Float16ToRawbits(Float16(24576.0)),
12274 Float16ToRawbits(Float16(11264.0)),
12275 Float16ToRawbits(Float16(0.0)),
12276 Float16ToRawbits(Float16(-18.4)),
12277 Float16ToRawbits(Float16(26.23)),
12278 Float16ToRawbits(kFP16PositiveInfinity),
12279 Float16ToRawbits(kFP16PositiveInfinity)};
12280
12281 FPBinArithHelper(config, fn, kHRegSize, zn_inputs, zm_inputs, expected_h);
12282
12283 uint32_t expected_s[] = {FloatToRawbits(24576.0),
12284 FloatToRawbits(11264.0),
12285 FloatToRawbits(0.0),
12286 FloatToRawbits(-18.40625),
12287 FloatToRawbits(26.2225),
12288 FloatToRawbits(kFP32PositiveInfinity),
12289 FloatToRawbits(kFP32PositiveInfinity)};
12290
12291 FPBinArithHelper(config, fn, kSRegSize, zn_inputs, zm_inputs, expected_s);
12292
12293 uint64_t expected_d[] = {DoubleToRawbits(24576.0),
12294 DoubleToRawbits(11264.0),
12295 DoubleToRawbits(0.0),
12296 DoubleToRawbits(-18.40625),
12297 DoubleToRawbits(26.2225),
12298 DoubleToRawbits(kFP64PositiveInfinity),
12299 DoubleToRawbits(kFP64PositiveInfinity)};
12300
12301 FPBinArithHelper(config, fn, kDRegSize, zn_inputs, zm_inputs, expected_d);
12302}
12303
12304typedef void (MacroAssembler::*FPArithPredicatedFn)(
12305 const ZRegister& zd,
12306 const PRegisterM& pg,
12307 const ZRegister& zn,
12308 const ZRegister& zm,
12309 FPMacroNaNPropagationOption nan_option);
12310
12311typedef void (MacroAssembler::*FPArithPredicatedNoNaNOptFn)(
12312 const ZRegister& zd,
12313 const PRegisterM& pg,
12314 const ZRegister& zn,
12315 const ZRegister& zm);
12316
12317template <typename Ti, typename Te, size_t N>
12318static void FPBinArithHelper(
12319 Test* config,
12320 FPArithPredicatedFn macro,
12321 FPArithPredicatedNoNaNOptFn macro_nonan,
12322 unsigned lane_size_in_bits,
12323 const Ti (&zd_inputs)[N],
12324 const int (&pg_inputs)[N],
12325 const Ti (&zn_inputs)[N],
12326 const Ti (&zm_inputs)[N],
12327 const Te (&zd_expected)[N],
12328 FPMacroNaNPropagationOption nan_option = FastNaNPropagation) {
12329 VIXL_ASSERT((macro == NULL) ^ (macro_nonan == NULL));
12330 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
12331 START();
12332
12333 // Avoid choosing default scratch registers.
12334 ZRegister zd = z26.WithLaneSize(lane_size_in_bits);
12335 ZRegister zn = z27.WithLaneSize(lane_size_in_bits);
12336 ZRegister zm = z28.WithLaneSize(lane_size_in_bits);
12337
12338 uint64_t zn_inputs_rawbits[N];
12339 uint64_t zm_inputs_rawbits[N];
12340 uint64_t zd_inputs_rawbits[N];
12341
12342 FPToRawbitsWithSize(zn_inputs, zn_inputs_rawbits, lane_size_in_bits);
12343 FPToRawbitsWithSize(zm_inputs, zm_inputs_rawbits, lane_size_in_bits);
12344 FPToRawbitsWithSize(zd_inputs, zd_inputs_rawbits, lane_size_in_bits);
12345
12346 InsrHelper(&masm, zn, zn_inputs_rawbits);
12347 InsrHelper(&masm, zm, zm_inputs_rawbits);
12348 InsrHelper(&masm, zd, zd_inputs_rawbits);
12349
12350 PRegisterWithLaneSize pg = p0.WithLaneSize(lane_size_in_bits);
12351 Initialise(&masm, pg, pg_inputs);
12352
12353 // `instr` zdn, pg, zdn, zm
12354 ZRegister dn_result = z0.WithLaneSize(lane_size_in_bits);
12355 __ Mov(dn_result, zn);
12356 if (macro_nonan == NULL) {
12357 (masm.*macro)(dn_result, pg.Merging(), dn_result, zm, nan_option);
12358 } else {
12359 (masm.*macro_nonan)(dn_result, pg.Merging(), dn_result, zm);
12360 }
12361
12362 // When zd aliases zm, the instruction macro (`Instr`) swaps the operand
12363 // order if the operation is commutative; otherwise it falls back to the
12364 // reversed form of the instruction, such as fdivr.
12365 // `instr` zdm, pg, zn, zdm
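// For example, since fdiv is not commutative, Fdiv(zd, pg.Merging(), zn, zd)
// can be emitted as `fdivr zd, pg/m, zd, zn`.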
12366 ZRegister dm_result = z1.WithLaneSize(lane_size_in_bits);
12367 __ Mov(dm_result, zm);
12368 if (macro_nonan == NULL) {
12369 (masm.*macro)(dm_result, pg.Merging(), zn, dm_result, nan_option);
12370 } else {
12371 (masm.*macro_nonan)(dm_result, pg.Merging(), zn, dm_result);
12372 }
12373
12374 // The instruction macro (`Instr`) automatically selects between `instr`
12375 // and movprfx + `instr`, based on whether the zd and zn registers are
12376 // aliased. A generated movprfx instruction is predicated, using the same
12377 // governing predicate register. In order to keep the result deterministic,
12378 // initialize the destination register first.
12379 // `instr` zd, pg, zn, zm
12380 ZRegister d_result = z2.WithLaneSize(lane_size_in_bits);
12381 __ Mov(d_result, zd);
12382 if (macro_nonan == NULL) {
12383 (masm.*macro)(d_result, pg.Merging(), zn, zm, nan_option);
12384 } else {
12385 (masm.*macro_nonan)(d_result, pg.Merging(), zn, zm);
12386 }
12387
12388 END();
12389
12390 if (CAN_RUN()) {
12391 RUN();
12392
12393 for (size_t i = 0; i < ArrayLength(zd_expected); i++) {
12394 int lane = static_cast<int>(ArrayLength(zd_expected) - i - 1);
12395 if (!core.HasSVELane(dn_result, lane)) break;
12396 if ((pg_inputs[i] & 1) != 0) {
12397 ASSERT_EQUAL_SVE_LANE(zd_expected[i], dn_result, lane);
12398 } else {
12399 ASSERT_EQUAL_SVE_LANE(zn_inputs_rawbits[i], dn_result, lane);
12400 }
12401 }
12402
12403 for (size_t i = 0; i < ArrayLength(zd_expected); i++) {
12404 int lane = static_cast<int>(ArrayLength(zd_expected) - i - 1);
12405 if (!core.HasSVELane(dm_result, lane)) break;
12406 if ((pg_inputs[i] & 1) != 0) {
12407 ASSERT_EQUAL_SVE_LANE(zd_expected[i], dm_result, lane);
12408 } else {
12409 ASSERT_EQUAL_SVE_LANE(zm_inputs_rawbits[i], dm_result, lane);
12410 }
12411 }
12412
12413 ASSERT_EQUAL_SVE(zd_expected, d_result);
12414 }
12415}
12416
12417TEST_SVE(sve_binary_arithmetic_predicated_fdiv) {
12418 // The same inputs are shared across the different-precision tests.
12419 double zd_in[] = {0.1, 1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9};
12420
12421 double zn_in[] = {24.0,
12422 24.0,
12423 -2.0,
12424 -2.0,
12425 5.5,
12426 5.5,
12427 kFP64PositiveInfinity,
12428 kFP64PositiveInfinity,
12429 kFP64NegativeInfinity,
12430 kFP64NegativeInfinity};
12431
12432 double zm_in[] = {-2.0, -2.0, 24.0, 24.0, 0.5, 0.5, 0.65, 0.65, 24.0, 24.0};
12433
12434 int pg_in[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
12435
12436 uint16_t exp_h[] = {Float16ToRawbits(Float16(0.1)),
12437 Float16ToRawbits(Float16(-12.0)),
12438 Float16ToRawbits(Float16(2.2)),
12439 Float16ToRawbits(Float16(-0.0833)),
12440 Float16ToRawbits(Float16(4.4)),
12441 Float16ToRawbits(Float16(11.0)),
12442 Float16ToRawbits(Float16(6.6)),
12443 Float16ToRawbits(kFP16PositiveInfinity),
12444 Float16ToRawbits(Float16(8.8)),
12445 Float16ToRawbits(kFP16NegativeInfinity)};
12446
12447 FPBinArithHelper(config,
12448 NULL,
12449 &MacroAssembler::Fdiv,
12450 kHRegSize,
12451 zd_in,
12452 pg_in,
12453 zn_in,
12454 zm_in,
12455 exp_h);
12456
12457 uint32_t exp_s[] = {FloatToRawbits(0.1),
12458 FloatToRawbits(-12.0),
12459 FloatToRawbits(2.2),
12460 0xbdaaaaab,
12461 FloatToRawbits(4.4),
12462 FloatToRawbits(11.0),
12463 FloatToRawbits(6.6),
12464 FloatToRawbits(kFP32PositiveInfinity),
12465 FloatToRawbits(8.8),
12466 FloatToRawbits(kFP32NegativeInfinity)};
12467
12468 FPBinArithHelper(config,
12469 NULL,
12470 &MacroAssembler::Fdiv,
12471 kSRegSize,
12472 zd_in,
12473 pg_in,
12474 zn_in,
12475 zm_in,
12476 exp_s);
12477
12478 uint64_t exp_d[] = {DoubleToRawbits(0.1),
12479 DoubleToRawbits(-12.0),
12480 DoubleToRawbits(2.2),
12481 0xbfb5555555555555,
12482 DoubleToRawbits(4.4),
12483 DoubleToRawbits(11.0),
12484 DoubleToRawbits(6.6),
12485 DoubleToRawbits(kFP64PositiveInfinity),
12486 DoubleToRawbits(8.8),
12487 DoubleToRawbits(kFP64NegativeInfinity)};
12488
12489 FPBinArithHelper(config,
12490 NULL,
12491 &MacroAssembler::Fdiv,
12492 kDRegSize,
12493 zd_in,
12494 pg_in,
12495 zn_in,
12496 zm_in,
12497 exp_d);
12498}
12499
12500TEST_SVE(sve_select) {
12501 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
12502 START();
12503
12504 uint64_t in0[] = {0x01f203f405f607f8, 0xfefcf8f0e1c3870f, 0x123456789abcdef0};
12505 uint64_t in1[] = {0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa};
12506
12507 // For simplicity, we re-use the same pg for various lane sizes.
12508 // For D lanes: 1, 1, 0
12509 // For S lanes: 1, 1, 1, 0, 0
12510 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
12511 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
12512 Initialise(&masm, p0.VnB(), pg_in);
12513 PRegisterM pg = p0.Merging();
12514
12515 InsrHelper(&masm, z30.VnD(), in0);
12516 InsrHelper(&masm, z31.VnD(), in1);
12517
12518 __ Sel(z0.VnB(), pg, z30.VnB(), z31.VnB());
12519 __ Sel(z1.VnH(), pg, z30.VnH(), z31.VnH());
12520 __ Sel(z2.VnS(), pg, z30.VnS(), z31.VnS());
12521 __ Sel(z3.VnD(), pg, z30.VnD(), z31.VnD());
12522
12523 END();
12524
12525 if (CAN_RUN()) {
12526 RUN();
12527
12528 uint64_t expected_z0[] = {0xaaaaaaaa05aa07f8,
12529 0xfeaaaaf0aac3870f,
12530 0xaaaa56aa9abcdeaa};
12531 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
12532
12533 uint64_t expected_z1[] = {0xaaaaaaaaaaaa07f8,
12534 0xaaaaf8f0e1c3870f,
12535 0xaaaaaaaa9abcaaaa};
12536 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
12537
12538 uint64_t expected_z2[] = {0xaaaaaaaa05f607f8,
12539 0xfefcf8f0e1c3870f,
12540 0xaaaaaaaaaaaaaaaa};
12541 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
12542
12543 uint64_t expected_z3[] = {0x01f203f405f607f8,
12544 0xfefcf8f0e1c3870f,
12545 0xaaaaaaaaaaaaaaaa};
12546 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
12547 }
12548}
12549
12550TEST_SVE(sve_binary_arithmetic_predicated_fmax_fmin_h) {
12551 double zd_inputs[] = {1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8};
12552 double zn_inputs[] = {-2.1,
12553 8.5,
12554 225.5,
12555 0.0,
12556 8.8,
12557 -4.75,
12558 kFP64PositiveInfinity,
12559 kFP64NegativeInfinity};
12560 double zm_inputs[] = {-2.0,
12561 -13.0,
12562 24.0,
12563 0.01,
12564 0.5,
12565 300.75,
12566 kFP64NegativeInfinity,
12567 kFP64PositiveInfinity};
12568 int pg_inputs[] = {1, 1, 0, 1, 0, 1, 1, 1};
12569
12570 uint16_t zd_expected_max[] = {Float16ToRawbits(Float16(-2.0)),
12571 Float16ToRawbits(Float16(8.5)),
12572 Float16ToRawbits(Float16(3.3)),
12573 Float16ToRawbits(Float16(0.01)),
12574 Float16ToRawbits(Float16(5.5)),
12575 Float16ToRawbits(Float16(300.75)),
12576 Float16ToRawbits(kFP16PositiveInfinity),
12577 Float16ToRawbits(kFP16PositiveInfinity)};
12578 FPBinArithHelper(config,
12579 &MacroAssembler::Fmax,
12580 NULL,
12581 kHRegSize,
12582 zd_inputs,
12583 pg_inputs,
12584 zn_inputs,
12585 zm_inputs,
12586 zd_expected_max);
12587
12588 uint16_t zd_expected_min[] = {Float16ToRawbits(Float16(-2.1)),
12589 Float16ToRawbits(Float16(-13.0)),
12590 Float16ToRawbits(Float16(3.3)),
12591 Float16ToRawbits(Float16(0.0)),
12592 Float16ToRawbits(Float16(5.5)),
12593 Float16ToRawbits(Float16(-4.75)),
12594 Float16ToRawbits(kFP16NegativeInfinity),
12595 Float16ToRawbits(kFP16NegativeInfinity)};
12596 FPBinArithHelper(config,
12597 &MacroAssembler::Fmin,
12598 NULL,
12599 kHRegSize,
12600 zd_inputs,
12601 pg_inputs,
12602 zn_inputs,
12603 zm_inputs,
12604 zd_expected_min);
12605}
12606
12607TEST_SVE(sve_binary_arithmetic_predicated_fmax_fmin_s) {
12608 double zd_inputs[] = {1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8};
12609 double zn_inputs[] = {-2.1,
12610 8.5,
12611 225.5,
12612 0.0,
12613 8.8,
12614 -4.75,
12615 kFP64PositiveInfinity,
12616 kFP64NegativeInfinity};
12617 double zm_inputs[] = {-2.0,
12618 -13.0,
12619 24.0,
12620 0.01,
12621 0.5,
12622 300.75,
12623 kFP64NegativeInfinity,
12624 kFP64PositiveInfinity};
12625 int pg_inputs[] = {1, 1, 0, 1, 0, 1, 1, 1};
12626
12627 uint32_t zd_expected_max[] = {FloatToRawbits(-2.0),
12628 FloatToRawbits(8.5),
12629 FloatToRawbits(3.3),
12630 FloatToRawbits(0.01),
12631 FloatToRawbits(5.5),
12632 FloatToRawbits(300.75),
12633 FloatToRawbits(kFP32PositiveInfinity),
12634 FloatToRawbits(kFP32PositiveInfinity)};
12635 FPBinArithHelper(config,
12636 &MacroAssembler::Fmax,
12637 NULL,
12638 kSRegSize,
12639 zd_inputs,
12640 pg_inputs,
12641 zn_inputs,
12642 zm_inputs,
12643 zd_expected_max);
12644
12645 uint32_t zd_expected_min[] = {FloatToRawbits(-2.1),
12646 FloatToRawbits(-13.0),
12647 FloatToRawbits(3.3),
12648 FloatToRawbits(0.0),
12649 FloatToRawbits(5.5),
12650 FloatToRawbits(-4.75),
12651 FloatToRawbits(kFP32NegativeInfinity),
12652 FloatToRawbits(kFP32NegativeInfinity)};
12653 FPBinArithHelper(config,
12654 &MacroAssembler::Fmin,
12655 NULL,
12656 kSRegSize,
12657 zd_inputs,
12658 pg_inputs,
12659 zn_inputs,
12660 zm_inputs,
12661 zd_expected_min);
12662}
12663
12664TEST_SVE(sve_binary_arithmetic_predicated_fmax_fmin_d) {
12665 double zd_inputs[] = {1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8};
12666 double zn_inputs[] = {-2.1,
12667 8.5,
12668 225.5,
12669 0.0,
12670 8.8,
12671 -4.75,
12672 kFP64PositiveInfinity,
12673 kFP64NegativeInfinity};
12674 double zm_inputs[] = {-2.0,
12675 -13.0,
12676 24.0,
12677 0.01,
12678 0.5,
12679 300.75,
12680 kFP64NegativeInfinity,
12681 kFP64PositiveInfinity};
12682 int pg_inputs[] = {1, 1, 0, 1, 0, 1, 1, 1};
12683
12684 uint64_t zd_expected_max[] = {DoubleToRawbits(-2.0),
12685 DoubleToRawbits(8.5),
12686 DoubleToRawbits(3.3),
12687 DoubleToRawbits(0.01),
12688 DoubleToRawbits(5.5),
12689 DoubleToRawbits(300.75),
12690 DoubleToRawbits(kFP64PositiveInfinity),
12691 DoubleToRawbits(kFP64PositiveInfinity)};
12692 FPBinArithHelper(config,
12693 &MacroAssembler::Fmax,
12694 NULL,
12695 kDRegSize,
12696 zd_inputs,
12697 pg_inputs,
12698 zn_inputs,
12699 zm_inputs,
12700 zd_expected_max);
12701
12702 uint64_t zd_expected_min[] = {DoubleToRawbits(-2.1),
12703 DoubleToRawbits(-13.0),
12704 DoubleToRawbits(3.3),
12705 DoubleToRawbits(0.0),
12706 DoubleToRawbits(5.5),
12707 DoubleToRawbits(-4.75),
12708 DoubleToRawbits(kFP64NegativeInfinity),
12709 DoubleToRawbits(kFP64NegativeInfinity)};
12710 FPBinArithHelper(config,
12711 &MacroAssembler::Fmin,
12712 NULL,
12713 kDRegSize,
12714 zd_inputs,
12715 pg_inputs,
12716 zn_inputs,
12717 zm_inputs,
12718 zd_expected_min);
12719}
12720
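// Check the immediate forms of Asr, Lsr and Lsl against reference results
// computed below. Asr and Lsr accept shifts up to the lane size; Lsl is
// tested with `shift - 1` since it accepts shifts up to lane_size - 1.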
12721template <typename T, size_t N>
12722static void BitwiseShiftImmHelper(Test* config,
12723 int lane_size_in_bits,
12724 const T (&zn_inputs)[N],
12725 int shift) {
12726 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
12727 START();
12728
12729 ZRegister zd_asr = z25.WithLaneSize(lane_size_in_bits);
12730 ZRegister zd_lsr = z26.WithLaneSize(lane_size_in_bits);
12731 ZRegister zd_lsl = z27.WithLaneSize(lane_size_in_bits);
12732 ZRegister zn = z28.WithLaneSize(lane_size_in_bits);
12733
12734 InsrHelper(&masm, zn, zn_inputs);
12735
12736 __ Asr(zd_asr, zn, shift);
12737 __ Lsr(zd_lsr, zn, shift);
12738 __ Lsl(zd_lsl, zn, shift - 1); // Lsl supports shifts of 0 to lane_size - 1.
12739
12740 END();
12741
12742 if (CAN_RUN()) {
12743 RUN();
12744
12745 const uint64_t mask = GetUintMask(lane_size_in_bits);
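// Compute the reference Asr result: a shift of lane_size or more leaves only
// copies of the sign bit, while smaller shifts sign-extend into the vacated
// bits.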
12746 for (int i = 0; i < static_cast<int>(N); i++) {
12747 int lane = N - i - 1;
12748 if (!core.HasSVELane(zd_asr, lane)) break;
12749 bool is_negative = (zn_inputs[i] & GetSignMask(lane_size_in_bits)) != 0;
12750 uint64_t result;
12751 if (shift >= lane_size_in_bits) {
12752 result = is_negative ? mask : 0;
12753 } else {
12754 result = zn_inputs[i] >> shift;
12755 if (is_negative) {
12756 result |= mask << (lane_size_in_bits - shift);
12757 result &= mask;
12758 }
12759 }
12760 ASSERT_EQUAL_SVE_LANE(result, zd_asr, lane);
12761 }
12762
12763 for (int i = 0; i < static_cast<int>(N); i++) {
12764 int lane = N - i - 1;
12765 if (!core.HasSVELane(zd_lsr, lane)) break;
12766 uint64_t result =
12767 (shift >= lane_size_in_bits) ? 0 : zn_inputs[i] >> shift;
12768 ASSERT_EQUAL_SVE_LANE(result, zd_lsr, lane);
12769 }
12770
12771 for (int i = 0; i < static_cast<int>(N); i++) {
12772 int lane = N - i - 1;
12773 if (!core.HasSVELane(zd_lsl, lane)) break;
12774 uint64_t result =
12775 (shift > lane_size_in_bits) ? 0 : zn_inputs[i] << (shift - 1);
12776 ASSERT_EQUAL_SVE_LANE(result & mask, zd_lsl, lane);
12777 }
12778 }
12779}
12780
12781TEST_SVE(sve_bitwise_shift_imm_unpredicated) {
12782 uint64_t inputs_b[] = {0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80};
12783 int shift_b[] = {1, 3, 5, 8};
12784 for (size_t i = 0; i < ArrayLength(shift_b); i++) {
12785 BitwiseShiftImmHelper(config, kBRegSize, inputs_b, shift_b[i]);
12786 }
12787
12788 uint64_t inputs_h[] = {0xfedc, 0xfa55, 0x0011, 0x2233};
12789 int shift_h[] = {1, 8, 11, 16};
12790 for (size_t i = 0; i < ArrayLength(shift_h); i++) {
12791 BitwiseShiftImmHelper(config, kHRegSize, inputs_h, shift_h[i]);
12792 }
12793
12794 uint64_t inputs_s[] = {0xfedcba98, 0xfffa55aa, 0x00112233};
12795 int shift_s[] = {1, 9, 17, 32};
12796 for (size_t i = 0; i < ArrayLength(shift_s); i++) {
12797 BitwiseShiftImmHelper(config, kSRegSize, inputs_s, shift_s[i]);
12798 }
12799
12800 uint64_t inputs_d[] = {0xfedcba98fedcba98,
12801 0xfffa5555aaaaaaaa,
12802 0x0011223344aafe80};
12803 int shift_d[] = {1, 23, 45, 64};
12804 for (size_t i = 0; i < ArrayLength(shift_d); i++) {
12805 BitwiseShiftImmHelper(config, kDRegSize, inputs_d, shift_d[i]);
12806 }
12807}
12808
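// Check the "shift by wide elements" forms, where the shift amount for each
// lane is taken from the D-sized element of zm covering that lane, including
// shift amounts equal to, and beyond, the lane size.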
12809template <typename T, typename R, size_t N>
12810static void BitwiseShiftWideElementsHelper(Test* config,
12811 Shift shift_type,
12812 int lane_size_in_bits,
12813 const T (&zn_inputs)[N],
12814 const R& zm_inputs,
12815 const T (&zd_expected)[N]) {
12816 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
12817 START();
12818
12819 ArithFn macro;
12820 // A logical shift left or right by the full lane width produces 0, so
12821 // initialize the array to 0 for convenience.
12822 uint64_t zd_expected_max_shift_amount[N] = {0};
12823 switch (shift_type) {
12824 case ASR: {
12825 macro = &MacroAssembler::Asr;
12826 uint64_t mask = GetUintMask(lane_size_in_bits);
12827 for (size_t i = 0; i < ArrayLength(zn_inputs); i++) {
12828 bool is_negative = (zn_inputs[i] & GetSignMask(lane_size_in_bits)) != 0;
12829 zd_expected_max_shift_amount[i] = is_negative ? mask : 0;
12830 }
12831 break;
12832 }
12833 case LSR:
12834 macro = &MacroAssembler::Lsr;
12835 break;
12836 case LSL:
12837 macro = &MacroAssembler::Lsl;
12838 break;
12839 default:
12840 VIXL_UNIMPLEMENTED();
12841 macro = NULL;
12842 break;
12843 }
12844
12845 ZRegister zd = z26.WithLaneSize(lane_size_in_bits);
12846 ZRegister zn = z27.WithLaneSize(lane_size_in_bits);
12847 ZRegister zm = z28.WithLaneSize(kDRegSize);
12848
12849 InsrHelper(&masm, zn, zn_inputs);
12850 InsrHelper(&masm, zm, zm_inputs);
12851
12852 (masm.*macro)(zd, zn, zm);
12853
12854 ZRegister zm_max_shift_amount = z25.WithLaneSize(kDRegSize);
12855 ZRegister zd_max_shift_amount = z24.WithLaneSize(lane_size_in_bits);
12856
12857 __ Dup(zm_max_shift_amount, lane_size_in_bits);
12858 (masm.*macro)(zd_max_shift_amount, zn, zm_max_shift_amount);
12859
12860 ZRegister zm_out_of_range = z23.WithLaneSize(kDRegSize);
12861 ZRegister zd_out_of_range = z22.WithLaneSize(lane_size_in_bits);
12862
12863 __ Dup(zm_out_of_range, GetUintMask(lane_size_in_bits));
12864 (masm.*macro)(zd_out_of_range, zn, zm_out_of_range);
12865
12866 END();
12867
12868 if (CAN_RUN()) {
12869 RUN();
12870
12871 ASSERT_EQUAL_SVE(zd_expected, zd);
12872 ASSERT_EQUAL_SVE(zd_expected_max_shift_amount, zd_max_shift_amount);
12873 ASSERT_EQUAL_SVE(zd_max_shift_amount, zd_out_of_range);
12874 }
12875}
12876
12877TEST_SVE(sve_bitwise_shift_wide_elements_unpredicated_asr) {
12878 // clang-format off
12879 uint64_t inputs_b[] = {0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80,
12880 0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80};
12881 int shift_b[] = {1, 3};
12882 uint64_t expected_b[] = {0xff, 0xee, 0xdd, 0xcc, 0xff, 0x2a, 0xd5, 0xc0,
12883 0xff, 0xfb, 0xf7, 0xf3, 0xff, 0x0a, 0xf5, 0xf0};
12884 BitwiseShiftWideElementsHelper(config,
12885 ASR,
12886 kBRegSize,
12887 inputs_b,
12888 shift_b,
12889 expected_b);
12890
12891 uint64_t inputs_h[] = {0xfedc, 0xfa55, 0x0011, 0x2233,
12892 0xfedc, 0xfa55, 0x0011, 0x2233,
12893 0xfedc, 0xfa55, 0x0011, 0x2233};
12894 int shift_h[] = {1, 8, 11};
12895 uint64_t expected_h[] = {0xff6e, 0xfd2a, 0x0008, 0x1119,
12896 0xfffe, 0xfffa, 0x0000, 0x0022,
12897 0xffff, 0xffff, 0x0000, 0x0004};
12898 BitwiseShiftWideElementsHelper(config,
12899 ASR,
12900 kHRegSize,
12901 inputs_h,
12902 shift_h,
12903 expected_h);
12904
12905 uint64_t inputs_s[] =
12906 {0xfedcba98, 0xfffa55aa, 0x00112233, 0x01234567, 0xaaaaaaaa, 0x88888888};
12907 int shift_s[] = {1, 9, 23};
12908 uint64_t expected_s[] =
12909 {0xff6e5d4c, 0xfffd2ad5, 0x00000891, 0x000091a2, 0xffffff55, 0xffffff11};
12910 BitwiseShiftWideElementsHelper(config,
12911 ASR,
12912 kSRegSize,
12913 inputs_s,
12914 shift_s,
12915 expected_s);
12916 // clang-format on
12917}
12918
12919TEST_SVE(sve_bitwise_shift_wide_elements_unpredicated_lsr) {
12920 // clang-format off
12921 uint64_t inputs_b[] = {0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80,
12922 0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80};
12923 int shift_b[] = {1, 3};
12924 uint64_t expected_b[] = {0x7f, 0x6e, 0x5d, 0x4c, 0x7f, 0x2a, 0x55, 0x40,
12925 0x1f, 0x1b, 0x17, 0x13, 0x1f, 0x0a, 0x15, 0x10};
12926
12927 BitwiseShiftWideElementsHelper(config,
12928 LSR,
12929 kBRegSize,
12930 inputs_b,
12931 shift_b,
12932 expected_b);
12933
12934 uint64_t inputs_h[] = {0xfedc, 0xfa55, 0x0011, 0x2233,
12935 0xfedc, 0xfa55, 0x0011, 0x2233,
12936 0xfedc, 0xfa55, 0x0011, 0x2233};
12937 int shift_h[] = {1, 8, 11};
12938 uint64_t expected_h[] = {0x7f6e, 0x7d2a, 0x0008, 0x1119,
12939 0x00fe, 0x00fa, 0x0000, 0x0022,
12940 0x001f, 0x001f, 0x0000, 0x0004};
12941 BitwiseShiftWideElementsHelper(config,
12942 LSR,
12943 kHRegSize,
12944 inputs_h,
12945 shift_h,
12946 expected_h);
12947
12948 uint64_t inputs_s[] =
12949 {0xfedcba98, 0xfffa55aa, 0x00112233, 0x01234567, 0xaaaaaaaa, 0x88888888};
12950 int shift_s[] = {1, 9, 23};
12951 uint64_t expected_s[] =
12952 {0x7f6e5d4c, 0x7ffd2ad5, 0x00000891, 0x000091a2, 0x00000155, 0x00000111};
12953 BitwiseShiftWideElementsHelper(config,
12954 LSR,
12955 kSRegSize,
12956 inputs_s,
12957 shift_s,
12958 expected_s);
12959 // clang-format on
12960}
12961
12962TEST_SVE(sve_bitwise_shift_wide_elements_unpredicated_lsl) {
12963 // clang-format off
12964 uint64_t inputs_b[] = {0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80,
12965 0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80};
12966 int shift_b[] = {1, 5};
12967
12968 uint64_t expected_b[] = {0xfc, 0xb8, 0x74, 0x30, 0xfe, 0xaa, 0x54, 0x00,
12969 0xc0, 0x80, 0x40, 0x00, 0xe0, 0xa0, 0x40, 0x00};
12970
12971 BitwiseShiftWideElementsHelper(config,
12972 LSL,
12973 kBRegSize,
12974 inputs_b,
12975 shift_b,
12976 expected_b);
12977 uint64_t inputs_h[] = {0xfedc, 0xfa55, 0x0011, 0x2233,
12978 0xfedc, 0xfa55, 0x0011, 0x2233,
12979 0xfedc, 0xfa55, 0x0011, 0x2233};
12980 int shift_h[] = {1, 2, 14};
12981
12982 uint64_t expected_h[] = {0xfdb8, 0xf4aa, 0x0022, 0x4466,
12983 0xfb70, 0xe954, 0x0044, 0x88cc,
12984 0x0000, 0x4000, 0x4000, 0xc000};
12985 BitwiseShiftWideElementsHelper(config,
12986 LSL,
12987 kHRegSize,
12988 inputs_h,
12989 shift_h,
12990 expected_h);
12991 uint64_t inputs_s[] =
12992 {0xfedcba98, 0xfffa55aa, 0x00112233, 0x01234567, 0xaaaaaaaa, 0x88888888};
12993 int shift_s[] = {1, 19, 26};
12994 uint64_t expected_s[] =
12995 {0xfdb97530, 0xfff4ab54, 0x11980000, 0x2b380000, 0xa8000000, 0x20000000};
12996 BitwiseShiftWideElementsHelper(config,
12997 LSL,
12998 kSRegSize,
12999 inputs_s,
13000 shift_s,
13001 expected_s);
13002
13003 // Test large shifts outside the range of the "unsigned" type.
13004 uint64_t inputs_b2[] = {1, 2, 4, 8, 3, 5, 7, 9,
13005 1, 2, 4, 8, 3, 5, 7, 9};
13006 uint64_t shift_b2[] = {1, 0x1000000001};
13007 uint64_t expected_b2[] = {2, 4, 8, 16, 6, 10, 14, 18,
13008 0, 0, 0, 0, 0, 0, 0, 0};
13009 BitwiseShiftWideElementsHelper(config, LSL, kBRegSize, inputs_b2, shift_b2,
13010 expected_b2);
13011
13012 // clang-format on
13013}
13014
13015TEST_SVE(sve_shift_by_vector) {
13016 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13017
13018 START();
13019 __ Ptrue(p0.VnB());
13020 __ Pfalse(p1.VnB());
13021 __ Zip1(p2.VnB(), p0.VnB(), p1.VnB());
13022 __ Zip1(p3.VnH(), p0.VnH(), p1.VnH());
13023 __ Zip1(p4.VnS(), p0.VnS(), p1.VnS());
13024 __ Zip1(p5.VnD(), p0.VnD(), p1.VnD());
13025
13026 __ Dup(z31.VnD(), 0x8000000080008080);
13027 __ Dup(z0.VnB(), -1);
13028
13029 __ Index(z1.VnB(), 0, 1);
13030 __ Dup(z2.VnB(), 0x55);
13031 __ Lsr(z2.VnB(), p2.Merging(), z0.VnB(), z1.VnB());
13032 __ Lsl(z3.VnB(), p0.Merging(), z0.VnB(), z1.VnB());
13033 __ Asr(z4.VnB(), p0.Merging(), z31.VnB(), z1.VnB());
13034
13035 __ Index(z1.VnH(), 0, 1);
13036 __ Dup(z6.VnB(), 0x55);
13037 __ Lsr(z5.VnH(), p0.Merging(), z0.VnH(), z1.VnH());
13038 __ Lsl(z6.VnH(), p3.Merging(), z0.VnH(), z1.VnH());
13039 __ Asr(z7.VnH(), p0.Merging(), z31.VnH(), z1.VnH());
13040
13041 __ Index(z1.VnS(), 0, 1);
13042 __ Dup(z10.VnB(), 0x55);
13043 __ Lsr(z8.VnS(), p0.Merging(), z0.VnS(), z1.VnS());
13044 __ Lsl(z9.VnS(), p0.Merging(), z0.VnS(), z1.VnS());
13045 __ Asr(z10.VnS(), p4.Merging(), z31.VnS(), z1.VnS());
13046
13047 __ Index(z1.VnD(), 0, 1);
13048 __ Lsr(z0.VnD(), p5.Merging(), z0.VnD(), z1.VnD());
13049 __ Lsl(z12.VnD(), p0.Merging(), z0.VnD(), z1.VnD());
13050 __ Asr(z13.VnD(), p0.Merging(), z31.VnD(), z1.VnD());
13051
13052 __ Dup(z11.VnD(), 0x100000001);
13053 __ Lsl(z14.VnD(), p0.Merging(), z1.VnD(), z11.VnD());
13054
13055 __ Index(z0.VnH(), 7, -1);
13056 __ Lsr(z0.VnH(), p0.Merging(), z31.VnH(), z0.VnH());
13057 END();
13058
13059 if (CAN_RUN()) {
13060 RUN();
13061
13062 uint64_t expected_z0[] = {0x8000000020001010, 0x0800000002000101};
13063 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
13064 uint64_t expected_z2[] = {0x5500550055005500, 0x5503550f553f55ff};
13065 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
13066 uint64_t expected_z3[] = {0x0000000000000000, 0x80c0e0f0f8fcfeff};
13067 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
13068 uint64_t expected_z4[] = {0xff000000ff00ffff, 0xff000000f000c080};
13069 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
13070 uint64_t expected_z5[] = {0x01ff03ff07ff0fff, 0x1fff3fff7fffffff};
13071 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
13072 uint64_t expected_z6[] = {0x5555ffc05555fff0, 0x5555fffc5555ffff};
13073 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
13074 uint64_t expected_z7[] = {0xff000000fc00f808, 0xf0000000c0008080};
13075 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
13076 uint64_t expected_z8[] = {0x1fffffff3fffffff, 0x7fffffffffffffff};
13077 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
13078 uint64_t expected_z9[] = {0xfffffff8fffffffc, 0xfffffffeffffffff};
13079 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
13080 uint64_t expected_z10[] = {0x55555555e0002020, 0x5555555580008080};
13081 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
13082 uint64_t expected_z12[] = {0xfffffffffffffffe, 0xffffffffffffffff};
13083 ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
13084 uint64_t expected_z13[] = {0xc000000040004040, 0x8000000080008080};
13085 ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
13086 uint64_t expected_z14[] = {0, 0};
13087 ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
13088 }
13089}
13090
13091TEST_SVE(sve_shift_by_wide_vector) {
13092 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13093
13094 START();
13095 __ Ptrue(p0.VnB());
13096 __ Pfalse(p1.VnB());
13097 __ Zip1(p2.VnB(), p0.VnB(), p1.VnB());
13098 __ Zip1(p3.VnH(), p0.VnH(), p1.VnH());
13099 __ Zip1(p4.VnS(), p0.VnS(), p1.VnS());
13100
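  // Here the shift amounts come from the D-sized lanes of z1; each 64-bit value
  // is expected to apply to all of the narrower lanes in the corresponding
  // 64-bit segment.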
13101 __ Dup(z31.VnD(), 0x8000000080008080);
13102 __ Dup(z0.VnB(), -1);
13103 __ Index(z1.VnD(), 1, 5);
13104
13105 __ Dup(z2.VnB(), 0x55);
13106 __ Lsr(z2.VnB(), p2.Merging(), z2.VnB(), z1.VnD());
13107 __ Lsl(z3.VnB(), p0.Merging(), z0.VnB(), z1.VnD());
13108 __ Asr(z4.VnB(), p0.Merging(), z31.VnB(), z1.VnD());
13109
13110 __ Dup(z6.VnB(), 0x55);
13111 __ Lsr(z5.VnH(), p0.Merging(), z0.VnH(), z1.VnD());
13112 __ Lsl(z6.VnH(), p3.Merging(), z6.VnH(), z1.VnD());
13113 __ Asr(z7.VnH(), p0.Merging(), z31.VnH(), z1.VnD());
13114
13115 __ Dup(z10.VnB(), 0x55);
13116 __ Lsr(z8.VnS(), p0.Merging(), z0.VnS(), z1.VnD());
13117 __ Lsl(z9.VnS(), p0.Merging(), z0.VnS(), z1.VnD());
13118 __ Asr(z10.VnS(), p4.Merging(), z31.VnS(), z1.VnD());
13119 END();
13120
13121 if (CAN_RUN()) {
13122 RUN();
13123
13124 uint64_t expected_z2[] = {0x5501550155015501, 0x552a552a552a552a};
13125 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
13126 uint64_t expected_z3[] = {0xc0c0c0c0c0c0c0c0, 0xfefefefefefefefe};
13127 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
13128 uint64_t expected_z4[] = {0xfe000000fe00fefe, 0xc0000000c000c0c0};
13129 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
13130 uint64_t expected_z5[] = {0x03ff03ff03ff03ff, 0x7fff7fff7fff7fff};
13131 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
13132 uint64_t expected_z6[] = {0x5555554055555540, 0x5555aaaa5555aaaa};
13133 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
13134 uint64_t expected_z7[] = {0xfe000000fe00fe02, 0xc0000000c000c040};
13135 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
13136 uint64_t expected_z8[] = {0x03ffffff03ffffff, 0x7fffffff7fffffff};
13137 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
13138 uint64_t expected_z9[] = {0xffffffc0ffffffc0, 0xfffffffefffffffe};
13139 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
13140 uint64_t expected_z10[] = {0x55555555fe000202, 0x55555555c0004040};
13141 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
13142 }
13143}
13144
Martyn Capewell83e86612020-02-19 15:46:15 +000013145TEST_SVE(sve_pred_shift_imm) {
13146 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13147
13148 START();
13149 __ Ptrue(p0.VnB());
13150 __ Pfalse(p1.VnB());
13151 __ Zip1(p2.VnB(), p0.VnB(), p1.VnB());
13152 __ Zip1(p3.VnH(), p0.VnH(), p1.VnH());
13153 __ Zip1(p4.VnS(), p0.VnS(), p1.VnS());
13154 __ Zip1(p5.VnD(), p0.VnD(), p1.VnD());
13155
13156 __ Dup(z31.VnD(), 0x8000000080008080);
13157 __ Lsr(z0.VnB(), p0.Merging(), z31.VnB(), 1);
13158 __ Mov(z1, z0);
13159 __ Lsl(z1.VnB(), p2.Merging(), z1.VnB(), 1);
13160 __ Asr(z2.VnB(), p0.Merging(), z1.VnB(), 2);
13161
13162 __ Lsr(z3.VnH(), p0.Merging(), z31.VnH(), 2);
13163 __ Mov(z4, z3);
13164 __ Lsl(z4.VnH(), p3.Merging(), z4.VnH(), 2);
13165 __ Asr(z5.VnH(), p0.Merging(), z4.VnH(), 3);
13166
13167 __ Lsr(z6.VnS(), p0.Merging(), z31.VnS(), 3);
13168 __ Mov(z7, z6);
13169 __ Lsl(z7.VnS(), p4.Merging(), z7.VnS(), 3);
13170 __ Asr(z8.VnS(), p0.Merging(), z7.VnS(), 4);
13171
13172 __ Lsr(z9.VnD(), p0.Merging(), z31.VnD(), 4);
13173 __ Mov(z10, z9);
13174 __ Lsl(z10.VnD(), p5.Merging(), z10.VnD(), 4);
13175 __ Asr(z11.VnD(), p0.Merging(), z10.VnD(), 5);
13176 END();
13177
13178 if (CAN_RUN()) {
13179 RUN();
13180 uint64_t expected_z0[] = {0x4000000040004040, 0x4000000040004040};
13181 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
13182 uint64_t expected_z1[] = {0x4000000040004080, 0x4000000040004080};
13183 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
13184 uint64_t expected_z2[] = {0x10000000100010e0, 0x10000000100010e0};
13185 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
13186 uint64_t expected_z3[] = {0x2000000020002020, 0x2000000020002020};
13187 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
13188 uint64_t expected_z4[] = {0x2000000020008080, 0x2000000020008080};
13189 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
13190 uint64_t expected_z5[] = {0x040000000400f010, 0x040000000400f010};
13191 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
13192 uint64_t expected_z6[] = {0x1000000010001010, 0x1000000010001010};
13193 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
13194 uint64_t expected_z7[] = {0x1000000080008080, 0x1000000080008080};
13195 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
13196 uint64_t expected_z8[] = {0x01000000f8000808, 0x01000000f8000808};
13197 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
13198 uint64_t expected_z9[] = {0x0800000008000808, 0x0800000008000808};
13199 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
13200 uint64_t expected_z10[] = {0x0800000008000808, 0x8000000080008080};
13201 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
13202 uint64_t expected_z11[] = {0x0040000000400040, 0xfc00000004000404};
13203 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
13204 }
13205}
13206
13207TEST_SVE(sve_asrd) {
13208 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13209
13210 START();
13211 __ Ptrue(p0.VnB());
13212 __ Pfalse(p1.VnB());
13213 __ Zip1(p2.VnB(), p0.VnB(), p1.VnB());
13214 __ Zip1(p3.VnH(), p0.VnH(), p1.VnH());
13215 __ Zip1(p4.VnS(), p0.VnS(), p1.VnS());
13216 __ Zip1(p5.VnD(), p0.VnD(), p1.VnD());
13217
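  // Asrd is an arithmetic shift right intended for signed division by a power
  // of two, so (unlike Asr) it should round towards zero for negative values.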
13218 __ Index(z31.VnB(), 0x7f - 3, 1);
13219 __ Asrd(z0.VnB(), p0.Merging(), z31.VnB(), 1);
13220 __ Mov(z1, z31);
13221 __ Asrd(z1.VnB(), p2.Merging(), z1.VnB(), 2);
13222 __ Asrd(z2.VnB(), p0.Merging(), z31.VnB(), 7);
13223 __ Asrd(z3.VnB(), p0.Merging(), z31.VnB(), 8);
13224
13225 __ Index(z31.VnH(), 0x7fff - 3, 1);
13226 __ Asrd(z4.VnH(), p0.Merging(), z31.VnH(), 1);
13227 __ Mov(z5, z31);
13228 __ Asrd(z5.VnH(), p3.Merging(), z5.VnH(), 2);
13229 __ Asrd(z6.VnH(), p0.Merging(), z31.VnH(), 15);
13230 __ Asrd(z7.VnH(), p0.Merging(), z31.VnH(), 16);
13231
13232 __ Index(z31.VnS(), 0x7fffffff - 1, 1);
13233 __ Asrd(z8.VnS(), p0.Merging(), z31.VnS(), 1);
13234 __ Mov(z9, z31);
13235 __ Asrd(z9.VnS(), p4.Merging(), z9.VnS(), 2);
13236 __ Asrd(z10.VnS(), p0.Merging(), z31.VnS(), 31);
13237 __ Asrd(z11.VnS(), p0.Merging(), z31.VnS(), 32);
13238
13239 __ Index(z31.VnD(), 0x7fffffffffffffff, 1);
13240 __ Asrd(z12.VnD(), p0.Merging(), z31.VnD(), 1);
13241 __ Mov(z13, z31);
13242 __ Asrd(z13.VnD(), p5.Merging(), z13.VnD(), 2);
13243 __ Asrd(z14.VnD(), p0.Merging(), z31.VnD(), 63);
13244 __ Asrd(z31.VnD(), p0.Merging(), z31.VnD(), 64);
13245 END();
13246
13247 if (CAN_RUN()) {
13248 RUN();
13249 uint64_t expected_z0[] = {0xc6c5c5c4c4c3c3c2, 0xc2c1c1c03f3f3e3e};
13250 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
13251 uint64_t expected_z1[] = {0x8be389e287e285e1, 0x83e181e07f1f7d1f};
13252 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
13253 uint64_t expected_z2[] = {0x0000000000000000, 0x000000ff00000000};
13254 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
13255 uint64_t expected_z3[] = {0x0000000000000000, 0x0000000000000000};
13256 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
13257 uint64_t expected_z4[] = {0xc002c001c001c000, 0x3fff3fff3ffe3ffe};
13258 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
13259 uint64_t expected_z5[] = {0x8003e0018001e000, 0x7fff1fff7ffd1fff};
13260 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
13261 uint64_t expected_z6[] = {0x000000000000ffff, 0x0000000000000000};
13262 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
13263 uint64_t expected_z7[] = {0x0000000000000000, 0x0000000000000000};
13264 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
13265 uint64_t expected_z8[] = {0xc0000001c0000000, 0x3fffffff3fffffff};
13266 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
13267 uint64_t expected_z9[] = {0x80000001e0000000, 0x7fffffff1fffffff};
13268 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
13269 uint64_t expected_z10[] = {0x00000000ffffffff, 0x0000000000000000};
13270 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
13271 uint64_t expected_z11[] = {0x0000000000000000, 0x0000000000000000};
13272 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
13273 uint64_t expected_z12[] = {0xc000000000000000, 0x3fffffffffffffff};
13274 ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
13275 uint64_t expected_z13[] = {0x8000000000000000, 0x1fffffffffffffff};
13276 ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
13277 uint64_t expected_z14[] = {0xffffffffffffffff, 0x0000000000000000};
13278 ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
13279 uint64_t expected_z31[] = {0x0000000000000000, 0x0000000000000000};
13280 ASSERT_EQUAL_SVE(expected_z31, z31.VnD());
13281 }
13282}
13283
TatWai Chong4023d7a2019-11-18 14:16:28 -080013284TEST_SVE(sve_setffr) {
13285 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13286 START();
13287
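  // Setffr sets every bit of the first-fault register (FFR), so reading it back
  // should match an all-true predicate.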
13288 __ Ptrue(p15.VnB());
13289 __ Setffr();
13290 __ Rdffr(p14.VnB());
13291
13292 END();
13293
13294 if (CAN_RUN()) {
13295 RUN();
13296
13297 ASSERT_EQUAL_SVE(p14.VnB(), p15.VnB());
13298 }
13299}
13300
13301static void WrffrHelper(Test* config, unsigned active_lanes) {
13302 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13303 START();
13304
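  // Write a predicate with `active_lanes` active low-order lanes to the FFR,
  // then read it back; the two predicates should match.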
13305 int inputs[kPRegMaxSize] = {0};
13306 VIXL_ASSERT(active_lanes <= kPRegMaxSize);
13307 for (unsigned i = 0; i < active_lanes; i++) {
13308 // The rightmost (highest-indexed) array element maps to the lowest-numbered
13309 // lane.
13310 inputs[kPRegMaxSize - i - 1] = 1;
13311 }
13312
13313 Initialise(&masm, p1.VnB(), inputs);
13314 __ Wrffr(p1.VnB());
13315 __ Rdffr(p2.VnB());
13316
13317 END();
13318
13319 if (CAN_RUN()) {
13320 RUN();
13321
13322 ASSERT_EQUAL_SVE(p1.VnB(), p2.VnB());
13323 }
13324}
13325
13326TEST_SVE(sve_wrffr) {
13327 int active_lanes_inputs[] = {0, 1, 7, 10, 32, 48, kPRegMaxSize};
13328 for (size_t i = 0; i < ArrayLength(active_lanes_inputs); i++) {
13329 WrffrHelper(config, active_lanes_inputs[i]);
13330 }
13331}
13332
TatWai Chonga3e8b172019-11-22 21:48:56 -080013333template <size_t N>
13334static void RdffrHelper(Test* config,
13335 size_t active_lanes,
13336 const int (&pg_inputs)[N]) {
13337 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13338 START();
13339
13340 VIXL_ASSERT(active_lanes <= kPRegMaxSize);
13341
13342 // The rightmost (highest-indexed) array element maps to the lowest-numbered
13343 // lane.
13344 int pd[kPRegMaxSize] = {0};
13345 for (unsigned i = 0; i < active_lanes; i++) {
13346 pd[kPRegMaxSize - i - 1] = 1;
13347 }
13348
13349 int pg[kPRegMaxSize] = {0};
13350 for (unsigned i = 0; i < N; i++) {
13351 pg[kPRegMaxSize - i - 1] = pg_inputs[i];
13352 }
13353
13354 int pd_expected[kPRegMaxSize] = {0};
13355 for (unsigned i = 0; i < std::min(active_lanes, N); i++) {
13356 int lane = kPRegMaxSize - i - 1;
13357 pd_expected[lane] = pd[lane] & pg[lane];
13358 }
13359
13360 Initialise(&masm, p0.VnB(), pg);
13361 Initialise(&masm, p1.VnB(), pd);
13362
13363 // The unpredicated form of rdffr has been tested in `WrffrHelper`.
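  // Here the predicated (zeroing) forms are used: the result should be the FFR
  // contents ANDed with the governing predicate, and Rdffrs should also set
  // NZCV from the usual SVE predicate test (checked via x8 below).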
13364 __ Wrffr(p1.VnB());
13365 __ Rdffr(p14.VnB(), p0.Zeroing());
13366 __ Rdffrs(p13.VnB(), p0.Zeroing());
13367 __ Mrs(x8, NZCV);
13368
13369 END();
13370
13371 if (CAN_RUN()) {
13372 RUN();
13373
13374 ASSERT_EQUAL_SVE(pd_expected, p14.VnB());
13375 ASSERT_EQUAL_SVE(pd_expected, p13.VnB());
13376 StatusFlags nzcv_expected =
13377 GetPredTestFlags(pd_expected, pg, core.GetSVELaneCount(kBRegSize));
13378 ASSERT_EQUAL_64(nzcv_expected, x8);
13379 }
13380}
13381
13382TEST_SVE(sve_rdffr_rdffrs) {
13383 // clang-format off
13384 int active_lanes_inputs[] = {0, 1, 15, 26, 39, 47, kPRegMaxSize};
13385 int pg_inputs_0[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
13386 int pg_inputs_1[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
13387 int pg_inputs_2[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
13388 int pg_inputs_3[] = {0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1};
13389 int pg_inputs_4[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
13390 // clang-format on
13391
13392 for (size_t i = 0; i < ArrayLength(active_lanes_inputs); i++) {
13393 RdffrHelper(config, active_lanes_inputs[i], pg_inputs_0);
13394 RdffrHelper(config, active_lanes_inputs[i], pg_inputs_1);
13395 RdffrHelper(config, active_lanes_inputs[i], pg_inputs_2);
13396 RdffrHelper(config, active_lanes_inputs[i], pg_inputs_3);
13397 RdffrHelper(config, active_lanes_inputs[i], pg_inputs_4);
13398 }
13399}
13400
TatWai Chong38303d92019-12-02 15:49:29 -080013401typedef void (MacroAssembler::*BrkpFn)(const PRegisterWithLaneSize& pd,
13402 const PRegisterZ& pg,
13403 const PRegisterWithLaneSize& pn,
13404 const PRegisterWithLaneSize& pm);
13405
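// Brkpa and Brkpb propagate a break condition between predicate partitions.
// Broadly: if the last active element of `pn` is true, the active elements of
// `pd` are set to true up to and including (Brkpa) or up to but not including
// (Brkpb) the first active true element of `pm`, and to false after that;
// otherwise `pd` is set to all-false. The expected values in the tests below
// follow this model.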
13406template <typename Tg, typename Tn, typename Td>
13407static void BrkpaBrkpbHelper(Test* config,
13408 BrkpFn macro,
13409 BrkpFn macro_set_flags,
13410 const Tg& pg_inputs,
13411 const Tn& pn_inputs,
13412 const Tn& pm_inputs,
13413 const Td& pd_expected) {
13414 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13415 START();
13416
13417 PRegister pg = p15;
13418 PRegister pn = p14;
13419 PRegister pm = p13;
13420 Initialise(&masm, pg.VnB(), pg_inputs);
13421 Initialise(&masm, pn.VnB(), pn_inputs);
13422 Initialise(&masm, pm.VnB(), pm_inputs);
13423
13424 // Initialise NZCV to an impossible value, to check that we actually write it.
13425 __ Mov(x10, NZCVFlag);
13426 __ Msr(NZCV, x10);
13427
13428 (masm.*macro_set_flags)(p0.VnB(), pg.Zeroing(), pn.VnB(), pm.VnB());
13429 __ Mrs(x0, NZCV);
13430
13431 (masm.*macro)(p1.VnB(), pg.Zeroing(), pn.VnB(), pm.VnB());
13432
13433 END();
13434
13435 if (CAN_RUN()) {
13436 RUN();
13437
13438 ASSERT_EQUAL_SVE(pd_expected, p0.VnB());
13439
13440 // Check that the flags were properly set.
13441 StatusFlags nzcv_expected =
13442 GetPredTestFlags(pd_expected,
13443 pg_inputs,
13444 core.GetSVELaneCount(kBRegSize));
13445 ASSERT_EQUAL_64(nzcv_expected, x0);
13446 ASSERT_EQUAL_SVE(p0.VnB(), p1.VnB());
13447 }
13448}
13449
13450template <typename Tg, typename Tn, typename Td>
13451static void BrkpaHelper(Test* config,
13452 const Tg& pg_inputs,
13453 const Tn& pn_inputs,
13454 const Tn& pm_inputs,
13455 const Td& pd_expected) {
13456 BrkpaBrkpbHelper(config,
13457 &MacroAssembler::Brkpa,
13458 &MacroAssembler::Brkpas,
13459 pg_inputs,
13460 pn_inputs,
13461 pm_inputs,
13462 pd_expected);
13463}
13464
13465template <typename Tg, typename Tn, typename Td>
13466static void BrkpbHelper(Test* config,
13467 const Tg& pg_inputs,
13468 const Tn& pn_inputs,
13469 const Tn& pm_inputs,
13470 const Td& pd_expected) {
13471 BrkpaBrkpbHelper(config,
13472 &MacroAssembler::Brkpb,
13473 &MacroAssembler::Brkpbs,
13474 pg_inputs,
13475 pn_inputs,
13476 pm_inputs,
13477 pd_expected);
13478}
13479
13480TEST_SVE(sve_brkpb) {
13481 // clang-format off
13482 // The last active element of each `pn` input is `true` in all vector length configurations.
13483 // | boundary of 128-bits VL.
13484 // v
13485 int pg_1[] = {1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
13486 int pg_2[] = {1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
13487 int pg_3[] = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
13488
13489 // | highest-numbered lane lowest-numbered lane |
13490 // v v
13491 int pn_1[] = {1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
13492 int pn_2[] = {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
13493 int pn_3[] = {1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1};
13494
13495 int pm_1[] = {1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
13496 int pm_2[] = {0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
13497 int pm_3[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
13498
13499 // | first active
13500 // v
13501 int exp_1_1_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
13502 // | first active
13503 // v
13504 int exp_1_2_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
13505 // | first active
13506 // v
13507 int exp_1_3_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
13508
13509 BrkpbHelper(config, pg_1, pn_1, pm_1, exp_1_1_1);
13510 BrkpbHelper(config, pg_1, pn_2, pm_2, exp_1_2_2);
13511 BrkpbHelper(config, pg_1, pn_3, pm_3, exp_1_3_3);
13512
13513 // | first active
13514 // v
13515 int exp_2_1_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
13516 // | first active
13517 // v
13518 int exp_2_2_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
13519 // | first active
13520 // v
13521 int exp_2_3_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1};
13522 BrkpbHelper(config, pg_2, pn_1, pm_2, exp_2_1_2);
13523 BrkpbHelper(config, pg_2, pn_2, pm_3, exp_2_2_3);
13524 BrkpbHelper(config, pg_2, pn_3, pm_1, exp_2_3_1);
13525
13526 // | first active
13527 // v
13528 int exp_3_1_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
13529 // | first active
13530 // v
13531 int exp_3_2_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
13532 // | first active
13533 // v
13534 int exp_3_3_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
13535 BrkpbHelper(config, pg_3, pn_1, pm_3, exp_3_1_3);
13536 BrkpbHelper(config, pg_3, pn_2, pm_1, exp_3_2_1);
13537 BrkpbHelper(config, pg_3, pn_3, pm_2, exp_3_3_2);
13538
13539 // The last active element of each `pn` input is `false` in all vector length configurations.
13540 // | last active lane when VL > 128 bits.
13541 // v
13542 // | last active lane when VL == 128 bits.
13543 // v
13544 int pg_4[] = {0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
13545 int exp_4_x_x[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
13546 BrkpbHelper(config, pg_4, pn_1, pm_1, exp_4_x_x);
13547 BrkpbHelper(config, pg_4, pn_2, pm_2, exp_4_x_x);
13548 BrkpbHelper(config, pg_4, pn_3, pm_3, exp_4_x_x);
13549 // clang-format on
13550}
13551
13552TEST_SVE(sve_brkpa) {
13553 // clang-format off
13554 // The last active element of each `pn` input is `true` in all vector length configurations.
13555 // | boundary of 128-bits VL.
13556 // v
13557 int pg_1[] = {1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
13558 int pg_2[] = {1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
13559 int pg_3[] = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
13560
13561 // | highest-numbered lane lowest-numbered lane |
13562 // v v
13563 int pn_1[] = {1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
13564 int pn_2[] = {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
13565 int pn_3[] = {1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1};
13566
13567 int pm_1[] = {1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
13568 int pm_2[] = {0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
13569 int pm_3[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
13570
13571 // | first active
13572 // v
13573 int exp_1_1_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0};
13574 // | first active
13575 // v
13576 int exp_1_2_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
13577 // | first active
13578 // v
13579 int exp_1_3_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0};
13580
13581 BrkpaHelper(config, pg_1, pn_1, pm_1, exp_1_1_1);
13582 BrkpaHelper(config, pg_1, pn_2, pm_2, exp_1_2_2);
13583 BrkpaHelper(config, pg_1, pn_3, pm_3, exp_1_3_3);
13584
13585 // | first active
13586 // v
13587 int exp_2_1_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
13588 // | first active
13589 // v
13590 int exp_2_2_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
13591 // | first active
13592 // v
13593 int exp_2_3_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1};
13594 BrkpaHelper(config, pg_2, pn_1, pm_2, exp_2_1_2);
13595 BrkpaHelper(config, pg_2, pn_2, pm_3, exp_2_2_3);
13596 BrkpaHelper(config, pg_2, pn_3, pm_1, exp_2_3_1);
13597
13598 // | first active
13599 // v
13600 int exp_3_1_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1};
13601 // | first active
13602 // v
13603 int exp_3_2_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1};
13604 // | first active
13605 // v
13606 int exp_3_3_2[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
13607 BrkpaHelper(config, pg_3, pn_1, pm_3, exp_3_1_3);
13608 BrkpaHelper(config, pg_3, pn_2, pm_1, exp_3_2_1);
13609 BrkpaHelper(config, pg_3, pn_3, pm_2, exp_3_3_2);
13610
13611 // The last active element of each `pn` input is `false` in all vector length configurations.
13612 // | last active lane when VL > 128 bits.
13613 // v
13614 // | last active lane when VL == 128 bits.
13615 // v
13616 int pg_4[] = {0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
13617 int exp_4_x_x[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
13618 BrkpaHelper(config, pg_4, pn_1, pm_1, exp_4_x_x);
13619 BrkpaHelper(config, pg_4, pn_2, pm_2, exp_4_x_x);
13620 BrkpaHelper(config, pg_4, pn_3, pm_3, exp_4_x_x);
13621 // clang-format on
13622}
13623
Martyn Capewell77b6d982019-12-02 18:34:59 +000013624TEST_SVE(sve_rbit) {
13625 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13626 START();
13627
13628 uint64_t inputs[] = {0xaaaaaaaa55555555, 0xaaaa5555aa55aa55};
13629 InsrHelper(&masm, z0.VnD(), inputs);
13630
13631 __ Ptrue(p1.VnB());
13632 int pred[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1};
13633 Initialise(&masm, p2.VnB(), pred);
13634
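  // Rbit reverses the bit order within each element, so applying it twice
  // should return the original value (z0 is checked against `inputs` below).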
13635 __ Rbit(z0.VnB(), p1.Merging(), z0.VnB());
13636 __ Rbit(z0.VnB(), p1.Merging(), z0.VnB());
13637
13638 __ Rbit(z1.VnB(), p1.Merging(), z0.VnB());
13639 __ Rbit(z2.VnH(), p1.Merging(), z0.VnH());
13640 __ Rbit(z3.VnS(), p1.Merging(), z0.VnS());
13641 __ Rbit(z4.VnD(), p1.Merging(), z0.VnD());
13642
13643 __ Dup(z5.VnB(), 0x42);
13644 __ Rbit(z5.VnB(), p2.Merging(), z0.VnB());
13645 __ Dup(z6.VnB(), 0x42);
13646 __ Rbit(z6.VnS(), p2.Merging(), z0.VnS());
13647
13648 END();
13649
13650 if (CAN_RUN()) {
13651 RUN();
13652
13653 ASSERT_EQUAL_SVE(inputs, z0.VnD());
13654
13655 uint64_t expected_z1[] = {0x55555555aaaaaaaa, 0x5555aaaa55aa55aa};
13656 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
13657 uint64_t expected_z2[] = {0x55555555aaaaaaaa, 0x5555aaaaaa55aa55};
13658 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
13659 uint64_t expected_z3[] = {0x55555555aaaaaaaa, 0xaaaa5555aa55aa55};
13660 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
13661 uint64_t expected_z4[] = {0xaaaaaaaa55555555, 0xaa55aa55aaaa5555};
13662 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
13663 uint64_t expected_z5[] = {0x4255425542aa42aa, 0x4255424242aa42aa};
13664 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
13665 uint64_t expected_z6[] = {0x55555555aaaaaaaa, 0x42424242aa55aa55};
13666 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
13667 }
13668}
13669
13670TEST_SVE(sve_rev_bhw) {
13671 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13672 START();
13673
13674 uint64_t inputs[] = {0xaaaaaaaa55555555, 0xaaaa5555aa55aa55};
13675 InsrHelper(&masm, z0.VnD(), inputs);
13676
13677 __ Ptrue(p1.VnB());
13678 int pred[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1};
13679 Initialise(&masm, p2.VnB(), pred);
13680
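  // Revb, Revh and Revw reverse the order of bytes, halfwords and words
  // (respectively) within each element of the specified size.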
13681 __ Revb(z1.VnH(), p1.Merging(), z0.VnH());
13682 __ Revb(z2.VnS(), p1.Merging(), z0.VnS());
13683 __ Revb(z3.VnD(), p1.Merging(), z0.VnD());
13684 __ Revh(z4.VnS(), p1.Merging(), z0.VnS());
13685 __ Revh(z5.VnD(), p1.Merging(), z0.VnD());
13686 __ Revw(z6.VnD(), p1.Merging(), z0.VnD());
13687
13688 __ Dup(z7.VnB(), 0x42);
13689 __ Revb(z7.VnH(), p2.Merging(), z0.VnH());
13690 __ Dup(z8.VnB(), 0x42);
13691 __ Revh(z8.VnS(), p2.Merging(), z0.VnS());
13692
13693 END();
13694
13695 if (CAN_RUN()) {
13696 RUN();
13697
13698 uint64_t expected_z1[] = {0xaaaaaaaa55555555, 0xaaaa555555aa55aa};
13699 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
13700 uint64_t expected_z2[] = {0xaaaaaaaa55555555, 0x5555aaaa55aa55aa};
13701 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
13702 uint64_t expected_z3[] = {0x55555555aaaaaaaa, 0x55aa55aa5555aaaa};
13703 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
13704 uint64_t expected_z4[] = {0xaaaaaaaa55555555, 0x5555aaaaaa55aa55};
13705 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
13706 uint64_t expected_z5[] = {0x55555555aaaaaaaa, 0xaa55aa555555aaaa};
13707 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
13708 uint64_t expected_z6[] = {0x55555555aaaaaaaa, 0xaa55aa55aaaa5555};
13709 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
13710 uint64_t expected_z7[] = {0xaaaaaaaa55555555, 0xaaaa424255aa55aa};
13711 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
13712 uint64_t expected_z8[] = {0xaaaaaaaa55555555, 0x42424242aa55aa55};
13713 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
13714 }
13715}
13716
Martyn Capewell43782632019-12-12 13:22:10 +000013717TEST_SVE(sve_ftssel) {
13718 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13719 START();
13720
13721 uint64_t in[] = {0x1111777766665555, 0xaaaabbbbccccdddd};
13722 uint64_t q[] = {0x0001000300000002, 0x0001000200000003};
13723 InsrHelper(&masm, z0.VnD(), in);
13724 InsrHelper(&masm, z1.VnD(), q);
13725
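  // Ftssel selects the trigonometric starting value: for each lane, bit 0 of
  // the second operand appears to select 1.0 in place of the first operand, and
  // bit 1 flips the sign of the result (see the expected values below).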
13726 __ Ftssel(z2.VnH(), z0.VnH(), z1.VnH());
13727 __ Ftssel(z3.VnS(), z0.VnS(), z1.VnS());
13728 __ Ftssel(z4.VnD(), z0.VnD(), z1.VnD());
13729
13730 END();
13731
13732 if (CAN_RUN()) {
13733 RUN();
13734
13735 uint64_t expected_z2[] = {0x3c00bc006666d555, 0x3c003bbbccccbc00};
13736 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
13737 uint64_t expected_z3[] = {0xbf800000e6665555, 0x2aaabbbbbf800000};
13738 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
13739 uint64_t expected_z4[] = {0x9111777766665555, 0xbff0000000000000};
13740 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
13741 }
13742}
13743
13744TEST_SVE(sve_fexpa) {
13745 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13746 START();
13747
13748 uint64_t in0[] = {0x3ff0000000000000, 0x3ff0000000011001};
13749 uint64_t in1[] = {0x3ff000000002200f, 0xbff000000003301f};
13750 uint64_t in2[] = {0xbff000000004403f, 0x3ff0000000055040};
13751 uint64_t in3[] = {0x3f800000bf800001, 0x3f80000f3f80001f};
13752 uint64_t in4[] = {0x3f80002f3f82203f, 0xbf8000403f833041};
13753 uint64_t in5[] = {0x3c003c01bc00bc07, 0x3c08bc0f3c1fbc20};
13754 InsrHelper(&masm, z0.VnD(), in0);
13755 InsrHelper(&masm, z1.VnD(), in1);
13756 InsrHelper(&masm, z2.VnD(), in2);
13757 InsrHelper(&masm, z3.VnD(), in3);
13758 InsrHelper(&masm, z4.VnD(), in4);
13759 InsrHelper(&masm, z5.VnD(), in5);
13760
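  // Fexpa is the floating-point exponential accelerator; broadly, it forms a
  // new value using upper bits of the input for the exponent and a table lookup
  // on the low bits for the fraction.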
13761 __ Fexpa(z6.VnD(), z0.VnD());
13762 __ Fexpa(z7.VnD(), z1.VnD());
13763 __ Fexpa(z8.VnD(), z2.VnD());
13764 __ Fexpa(z9.VnS(), z3.VnS());
13765 __ Fexpa(z10.VnS(), z4.VnS());
13766 __ Fexpa(z11.VnH(), z5.VnH());
13767
13768 END();
13769
13770 if (CAN_RUN()) {
13771 RUN();
13772 uint64_t expected_z6[] = {0x0000000000000000, 0x44002c9a3e778061};
13773 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
13774 uint64_t expected_z7[] = {0x0802d285a6e4030b, 0x4c06623882552225};
13775 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
13776 uint64_t expected_z8[] = {0x100fa7c1819e90d8, 0x5410000000000000};
13777 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
13778 uint64_t expected_z9[] = {0x00000000000164d2, 0x0016942d003311c4};
13779 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
13780 uint64_t expected_z10[] = {0x0054f35b407d3e0c, 0x00800000608164d2};
13781 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
13782 uint64_t expected_z11[] = {0x00000016000000a8, 0x00c2018903d40400};
13783 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
13784 }
13785}
13786
Martyn Capewell7fd6fd52019-12-06 14:50:15 +000013787TEST_SVE(sve_rev_p) {
13788 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13789 START();
13790
13791 Initialise(&masm,
13792 p0.VnB(),
13793 0xabcdabcdabcdabcd,
13794 0xabcdabcdabcdabcd,
13795 0xabcdabcdabcdabcd,
13796 0xabcdabcdabcdabcd);
13797
13798 __ Rev(p1.VnB(), p0.VnB());
13799 __ Rev(p2.VnH(), p0.VnH());
13800 __ Rev(p3.VnS(), p0.VnS());
13801 __ Rev(p4.VnD(), p0.VnD());
13802
13803 END();
13804
13805 if (CAN_RUN()) {
13806 RUN();
13807
13808 int p1_expected[] = {1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1};
13809 ASSERT_EQUAL_SVE(p1_expected, p1.VnB());
13810 int p2_expected[] = {0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0};
13811 ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
13812 int p3_expected[] = {1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0};
13813 ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
13814 int p4_expected[] = {1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1};
13815 ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
13816 }
13817}
13818
13819TEST_SVE(sve_trn_p_bh) {
13820 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13821 START();
13822
13823 Initialise(&masm, p0.VnB(), 0xa5a55a5a);
13824 __ Pfalse(p1.VnB());
13825
13826 __ Trn1(p2.VnB(), p0.VnB(), p0.VnB());
13827 __ Trn2(p3.VnB(), p0.VnB(), p0.VnB());
13828 __ Trn1(p4.VnB(), p1.VnB(), p0.VnB());
13829 __ Trn2(p5.VnB(), p1.VnB(), p0.VnB());
13830 __ Trn1(p6.VnB(), p0.VnB(), p1.VnB());
13831 __ Trn2(p7.VnB(), p0.VnB(), p1.VnB());
13832
13833 __ Trn1(p8.VnH(), p0.VnH(), p0.VnH());
13834 __ Trn2(p9.VnH(), p0.VnH(), p0.VnH());
13835 __ Trn1(p10.VnH(), p1.VnH(), p0.VnH());
13836 __ Trn2(p11.VnH(), p1.VnH(), p0.VnH());
13837 __ Trn1(p12.VnH(), p0.VnH(), p1.VnH());
13838 __ Trn2(p13.VnH(), p0.VnH(), p1.VnH());
13839
13840 END();
13841
13842 if (CAN_RUN()) {
13843 RUN();
13844 int p2_expected[] = {1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0};
13845 int p3_expected[] = {0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1};
13846 ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
13847 ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
13848
13849 int p4_expected[] = {1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0};
13850 int p5_expected[] = {0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0};
13851 ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
13852 ASSERT_EQUAL_SVE(p5_expected, p5.VnB());
13853
13854 int p6_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0};
13855 int p7_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1};
13856 ASSERT_EQUAL_SVE(p6_expected, p6.VnB());
13857 ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
13858
13859 int p8_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
13860 int p9_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
13861 ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
13862 ASSERT_EQUAL_SVE(p9_expected, p9.VnB());
13863
13864 int p10_expected[] = {0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0};
13865 int p11_expected[] = {0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0};
13866 ASSERT_EQUAL_SVE(p10_expected, p10.VnB());
13867 ASSERT_EQUAL_SVE(p11_expected, p11.VnB());
13868
13869 int p12_expected[] = {0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0};
13870 int p13_expected[] = {0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0};
13871 ASSERT_EQUAL_SVE(p12_expected, p12.VnB());
13872 ASSERT_EQUAL_SVE(p13_expected, p13.VnB());
13873 }
13874}
13875
13876TEST_SVE(sve_trn_p_sd) {
13877 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13878 START();
13879
13880 Initialise(&masm, p0.VnB(), 0x55a55aaa);
13881 __ Pfalse(p1.VnB());
13882
13883 __ Trn1(p2.VnS(), p0.VnS(), p0.VnS());
13884 __ Trn2(p3.VnS(), p0.VnS(), p0.VnS());
13885 __ Trn1(p4.VnS(), p1.VnS(), p0.VnS());
13886 __ Trn2(p5.VnS(), p1.VnS(), p0.VnS());
13887 __ Trn1(p6.VnS(), p0.VnS(), p1.VnS());
13888 __ Trn2(p7.VnS(), p0.VnS(), p1.VnS());
13889
13890 __ Trn1(p8.VnD(), p0.VnD(), p0.VnD());
13891 __ Trn2(p9.VnD(), p0.VnD(), p0.VnD());
13892 __ Trn1(p10.VnD(), p1.VnD(), p0.VnD());
13893 __ Trn2(p11.VnD(), p1.VnD(), p0.VnD());
13894 __ Trn1(p12.VnD(), p0.VnD(), p1.VnD());
13895 __ Trn2(p13.VnD(), p0.VnD(), p1.VnD());
13896
13897 END();
13898
13899 if (CAN_RUN()) {
13900 RUN();
13901 int p2_expected[] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0};
13902 int p3_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0};
13903 ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
13904 ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
13905
13906 int p4_expected[] = {1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0};
13907 int p5_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0};
13908 ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
13909 ASSERT_EQUAL_SVE(p5_expected, p5.VnB());
13910
13911 int p6_expected[] = {0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0};
13912 int p7_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0};
13913 ASSERT_EQUAL_SVE(p6_expected, p6.VnB());
13914 ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
13915
13916 int p8_expected[] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0};
13917 int p9_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
13918 ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
13919 ASSERT_EQUAL_SVE(p9_expected, p9.VnB());
13920
13921 int p10_expected[] = {1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
13922 int p11_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
13923 ASSERT_EQUAL_SVE(p10_expected, p10.VnB());
13924 ASSERT_EQUAL_SVE(p11_expected, p11.VnB());
13925
13926 int p12_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0};
13927 int p13_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0};
13928 ASSERT_EQUAL_SVE(p12_expected, p12.VnB());
13929 ASSERT_EQUAL_SVE(p13_expected, p13.VnB());
13930 }
13931}
13932
13933TEST_SVE(sve_zip_p_bh) {
13934 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13935 START();
13936
13937 Initialise(&masm,
13938 p0.VnB(),
13939 0x5a5a5a5a5a5a5a5a,
13940 0x5a5a5a5a5a5a5a5a,
13941 0x5a5a5a5a5a5a5a5a,
13942 0x5a5a5a5a5a5a5a5a);
13943 __ Pfalse(p1.VnB());
13944
13945 __ Zip1(p2.VnB(), p0.VnB(), p0.VnB());
13946 __ Zip2(p3.VnB(), p0.VnB(), p0.VnB());
13947 __ Zip1(p4.VnB(), p1.VnB(), p0.VnB());
13948 __ Zip2(p5.VnB(), p1.VnB(), p0.VnB());
13949 __ Zip1(p6.VnB(), p0.VnB(), p1.VnB());
13950 __ Zip2(p7.VnB(), p0.VnB(), p1.VnB());
13951
13952 __ Zip1(p8.VnH(), p0.VnH(), p0.VnH());
13953 __ Zip2(p9.VnH(), p0.VnH(), p0.VnH());
13954 __ Zip1(p10.VnH(), p1.VnH(), p0.VnH());
13955 __ Zip2(p11.VnH(), p1.VnH(), p0.VnH());
13956 __ Zip1(p12.VnH(), p0.VnH(), p1.VnH());
13957 __ Zip2(p13.VnH(), p0.VnH(), p1.VnH());
13958
13959 END();
13960
13961 if (CAN_RUN()) {
13962 RUN();
13963 int p2_expected[] = {0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0};
13964 int p3_expected[] = {0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0};
13965 ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
13966 ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
13967
13968 int p4_expected[] = {0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0};
13969 int p5_expected[] = {0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0};
13970 ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
13971 ASSERT_EQUAL_SVE(p5_expected, p5.VnB());
13972
13973 int p6_expected[] = {0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0};
13974 int p7_expected[] = {0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0};
13975 ASSERT_EQUAL_SVE(p6_expected, p6.VnB());
13976 ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
13977
13978 int p8_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0};
13979 int p9_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0};
13980 ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
13981 ASSERT_EQUAL_SVE(p9_expected, p9.VnB());
13982
13983 int p10_expected[] = {0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
13984 int p11_expected[] = {0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
13985 ASSERT_EQUAL_SVE(p10_expected, p10.VnB());
13986 ASSERT_EQUAL_SVE(p11_expected, p11.VnB());
13987
13988 int p12_expected[] = {0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0};
13989 int p13_expected[] = {0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0};
13990 ASSERT_EQUAL_SVE(p12_expected, p12.VnB());
13991 ASSERT_EQUAL_SVE(p13_expected, p13.VnB());
13992 }
13993}
13994
13995TEST_SVE(sve_zip_p_sd) {
13996 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13997 START();
13998
13999 Initialise(&masm,
14000 p0.VnB(),
14001 0x5a5a5a5a5a5a5a5a,
14002 0x5a5a5a5a5a5a5a5a,
14003 0x5a5a5a5a5a5a5a5a,
14004 0x5a5a5a5a5a5a5a5a);
14005 __ Pfalse(p1.VnB());
14006
14007 __ Zip1(p2.VnS(), p0.VnS(), p0.VnS());
14008 __ Zip2(p3.VnS(), p0.VnS(), p0.VnS());
14009 __ Zip1(p4.VnS(), p1.VnS(), p0.VnS());
14010 __ Zip2(p5.VnS(), p1.VnS(), p0.VnS());
14011 __ Zip1(p6.VnS(), p0.VnS(), p1.VnS());
14012 __ Zip2(p7.VnS(), p0.VnS(), p1.VnS());
14013
14014 __ Zip1(p8.VnD(), p0.VnD(), p0.VnD());
14015 __ Zip2(p9.VnD(), p0.VnD(), p0.VnD());
14016 __ Zip1(p10.VnD(), p1.VnD(), p0.VnD());
14017 __ Zip2(p11.VnD(), p1.VnD(), p0.VnD());
14018 __ Zip1(p12.VnD(), p0.VnD(), p1.VnD());
14019 __ Zip2(p13.VnD(), p0.VnD(), p1.VnD());
14020
14021 END();
14022
14023 if (CAN_RUN()) {
14024 RUN();
14025 int p2_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0};
14026 int p3_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0};
14027 ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
14028 ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
14029
14030 int p4_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0};
14031 int p5_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0};
14032 ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
14033 ASSERT_EQUAL_SVE(p5_expected, p5.VnB());
14034
14035 int p6_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0};
14036 int p7_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0};
14037 ASSERT_EQUAL_SVE(p6_expected, p6.VnB());
14038 ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
14039
14040 int p8_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
14041 int p9_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
14042 ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
14043 ASSERT_EQUAL_SVE(p9_expected, p9.VnB());
14044
14045 int p10_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14046 int p11_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14047 ASSERT_EQUAL_SVE(p10_expected, p10.VnB());
14048 ASSERT_EQUAL_SVE(p11_expected, p11.VnB());
14049
14050 int p12_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0};
14051 int p13_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0};
14052 ASSERT_EQUAL_SVE(p12_expected, p12.VnB());
14053 ASSERT_EQUAL_SVE(p13_expected, p13.VnB());
14054 }
14055}
14056
14057TEST_SVE(sve_uzp_p) {
14058 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14059 START();
14060
14061 Initialise(&masm,
14062 p0.VnB(),
14063 0xf0f0ff00ffff0000,
14064 0x4242424242424242,
14065 0x5a5a5a5a5a5a5a5a,
14066 0x0123456789abcdef);
14067 __ Rev(p1.VnB(), p0.VnB());
14068
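  // Zipping two predicates and then unzipping the results should reconstruct
  // the original operands at every element size, so p4-p11 are checked against
  // p0 and p1 below.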
14069 __ Zip1(p2.VnB(), p0.VnB(), p1.VnB());
14070 __ Zip2(p3.VnB(), p0.VnB(), p1.VnB());
14071 __ Uzp1(p4.VnB(), p2.VnB(), p3.VnB());
14072 __ Uzp2(p5.VnB(), p2.VnB(), p3.VnB());
14073
14074 __ Zip1(p2.VnH(), p0.VnH(), p1.VnH());
14075 __ Zip2(p3.VnH(), p0.VnH(), p1.VnH());
14076 __ Uzp1(p6.VnH(), p2.VnH(), p3.VnH());
14077 __ Uzp2(p7.VnH(), p2.VnH(), p3.VnH());
14078
14079 __ Zip1(p2.VnS(), p0.VnS(), p1.VnS());
14080 __ Zip2(p3.VnS(), p0.VnS(), p1.VnS());
14081 __ Uzp1(p8.VnS(), p2.VnS(), p3.VnS());
14082 __ Uzp2(p9.VnS(), p2.VnS(), p3.VnS());
14083
14084 __ Zip1(p2.VnD(), p0.VnD(), p1.VnD());
14085 __ Zip2(p3.VnD(), p0.VnD(), p1.VnD());
14086 __ Uzp1(p10.VnD(), p2.VnD(), p3.VnD());
14087 __ Uzp2(p11.VnD(), p2.VnD(), p3.VnD());
14088
14089 END();
14090
14091 if (CAN_RUN()) {
14092 RUN();
14093
14094 ASSERT_EQUAL_SVE(p0, p4);
14095 ASSERT_EQUAL_SVE(p1, p5);
14096 ASSERT_EQUAL_SVE(p0, p6);
14097 ASSERT_EQUAL_SVE(p1, p7);
14098 ASSERT_EQUAL_SVE(p0, p8);
14099 ASSERT_EQUAL_SVE(p1, p9);
14100 ASSERT_EQUAL_SVE(p0, p10);
14101 ASSERT_EQUAL_SVE(p1, p11);
14102 }
14103}
14104
14105TEST_SVE(sve_punpk) {
14106 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14107 START();
14108
Jacob Bramley3980b742020-07-01 12:25:54 +010014109 auto get_64_bits_at = [](int byte_index) -> uint64_t {
14110 // Each 8-bit chunk has the value 0x50 + the byte index of the chunk.
14111 return 0x5756555453525150 + (0x0101010101010101 * byte_index);
14112 };
14113
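  // Punpklo and Punpkhi unpack the low and high halves of the source predicate
  // into H-sized lanes; the odd-numbered B lanes of the result should be zero.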
Martyn Capewell7fd6fd52019-12-06 14:50:15 +000014114 Initialise(&masm,
14115 p0.VnB(),
Jacob Bramley3980b742020-07-01 12:25:54 +010014116 get_64_bits_at(24),
14117 get_64_bits_at(16),
14118 get_64_bits_at(8),
14119 get_64_bits_at(0));
Martyn Capewell7fd6fd52019-12-06 14:50:15 +000014120 __ Punpklo(p1.VnH(), p0.VnB());
14121 __ Punpkhi(p2.VnH(), p0.VnB());
14122
14123 END();
14124
14125 if (CAN_RUN()) {
14126 RUN();
14127
Jacob Bramley3980b742020-07-01 12:25:54 +010014128 int pl = config->sve_vl_in_bits() / kZRegBitsPerPRegBit;
14129 // For simplicity, just test the bottom 64 H-sized lanes.
14130 uint64_t p1_h_bits = get_64_bits_at(0);
14131 uint64_t p2_h_bits = get_64_bits_at(pl / (2 * 8));
14132 int p1_expected[64];
14133 int p2_expected[64];
14134 for (size_t i = 0; i < 64; i++) {
14135 p1_expected[63 - i] = (p1_h_bits >> i) & 1;
14136 p2_expected[63 - i] = (p2_h_bits >> i) & 1;
14137 }
14138 // Testing `VnH` ensures that odd-numbered B lanes are zero.
14139 ASSERT_EQUAL_SVE(p1_expected, p1.VnH());
14140 ASSERT_EQUAL_SVE(p2_expected, p2.VnH());
Martyn Capewell7fd6fd52019-12-06 14:50:15 +000014141 }
14142}
14143
TatWai Chong5d872292020-01-02 15:39:51 -080014144typedef void (MacroAssembler::*BrkFn)(const PRegisterWithLaneSize& pd,
14145 const PRegister& pg,
14146 const PRegisterWithLaneSize& pn);
14147
14148typedef void (MacroAssembler::*BrksFn)(const PRegisterWithLaneSize& pd,
14149 const PRegisterZ& pg,
14150 const PRegisterWithLaneSize& pn);
14151
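// Brka and Brkb break on the first active true element of `pn`: the active
// elements of `pd` are set to true up to and including (Brka) or up to but not
// including (Brkb) that element, and to false afterwards. The expected values
// in `sve_brka` and `sve_brkb` below are based on this model.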
14152template <typename T, size_t N>
14153static void BrkaBrkbHelper(Test* config,
14154 BrkFn macro,
14155 BrksFn macro_set_flags,
14156 const T (&pd_inputs)[N],
14157 const T (&pg_inputs)[N],
14158 const T (&pn_inputs)[N],
14159 const T (&pd_z_expected)[N]) {
14160 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14161 START();
14162
14163 PRegister pg = p10;
14164 PRegister pn = p9;
14165 PRegister pd_z = p0;
14166 PRegister pd_z_s = p1;
14167 PRegister pd_m = p2;
14168 Initialise(&masm, pg.VnB(), pg_inputs);
14169 Initialise(&masm, pn.VnB(), pn_inputs);
14170 Initialise(&masm, pd_m.VnB(), pd_inputs);
14171
14172 // Initialise NZCV to an impossible value, to check that we actually write it.
14173 __ Mov(x10, NZCVFlag);
14174 __ Msr(NZCV, x10);
14175
14176 (masm.*macro)(pd_z.VnB(), pg.Zeroing(), pn.VnB());
14177 (masm.*macro_set_flags)(pd_z_s.VnB(), pg.Zeroing(), pn.VnB());
14178 __ Mrs(x0, NZCV);
14179
14180 (masm.*macro)(pd_m.VnB(), pg.Merging(), pn.VnB());
14181
14182 END();
14183
14184 if (CAN_RUN()) {
14185 RUN();
14186
14187 ASSERT_EQUAL_SVE(pd_z_expected, pd_z.VnB());
14188
14189 // Check that the flags were properly set.
14190 StatusFlags nzcv_expected =
14191 GetPredTestFlags(pd_z_expected,
14192 pg_inputs,
14193 core.GetSVELaneCount(kBRegSize));
14194 ASSERT_EQUAL_64(nzcv_expected, x0);
14195 ASSERT_EQUAL_SVE(pd_z.VnB(), pd_z_s.VnB());
14196
14197 T pd_m_expected[N];
14198 // Set the expected `pd` result for merging predication.
14199 for (size_t i = 0; i < N; i++) {
14200 pd_m_expected[i] = pg_inputs[i] ? pd_z_expected[i] : pd_inputs[i];
14201 }
14202 ASSERT_EQUAL_SVE(pd_m_expected, pd_m.VnB());
14203 }
14204}
14205
14206template <typename T>
14207static void BrkaHelper(Test* config,
14208 const T& pd_inputs,
14209 const T& pg_inputs,
14210 const T& pn_inputs,
14211 const T& pd_expected) {
14212 BrkaBrkbHelper(config,
14213 &MacroAssembler::Brka,
14214 &MacroAssembler::Brkas,
14215 pd_inputs,
14216 pg_inputs,
14217 pn_inputs,
14218 pd_expected);
14219}
14220
14221TEST_SVE(sve_brka) {
14222 // clang-format off
14223 // | boundary of 128-bits VL.
14224 // v
14225 int pd[] = {1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14226
14227 // | highest-numbered lane lowest-numbered lane |
14228 // v v
14229 int pg_1[] = {1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
14230 int pg_2[] = {1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
14231
14232 int pn_1[] = {1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
14233 int pn_2[] = {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14234 int pn_3[] = {1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1};
14235
14236 // | first break
14237 // v
14238 int exp_1_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0};
14239 // | first break
14240 // v
14241 int exp_1_2[] = {0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
14242 // | first break
14243 // v
14244 int exp_1_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
14245
14246 BrkaHelper(config, pd, pg_1, pn_1, exp_1_1);
14247 BrkaHelper(config, pd, pg_1, pn_2, exp_1_2);
14248 BrkaHelper(config, pd, pg_1, pn_3, exp_1_3);
14249
14250 // | first break
14251 // v
14252 int exp_2_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1};
14253 // | first break
14254 // v
14255 int exp_2_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
14256 // | first break
14257 // v
14258 int exp_2_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
14259 BrkaHelper(config, pd, pg_2, pn_1, exp_2_1);
14260 BrkaHelper(config, pd, pg_2, pn_2, exp_2_2);
14261 BrkaHelper(config, pd, pg_2, pn_3, exp_2_3);
14262
14263 // An all-inactive governing predicate with zeroing sets the destination predicate to all-false.
14264 int pg_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14265 int exp_3_x[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14266 BrkaHelper(config, pd, pg_3, pn_1, exp_3_x);
14267 BrkaHelper(config, pd, pg_3, pn_2, exp_3_x);
14268 BrkaHelper(config, pd, pg_3, pn_3, exp_3_x);
14269 // clang-format on
14270}
14271
14272template <typename T>
14273static void BrkbHelper(Test* config,
14274 const T& pd_inputs,
14275 const T& pg_inputs,
14276 const T& pn_inputs,
14277 const T& pd_expected) {
14278 BrkaBrkbHelper(config,
14279 &MacroAssembler::Brkb,
14280 &MacroAssembler::Brkbs,
14281 pd_inputs,
14282 pg_inputs,
14283 pn_inputs,
14284 pd_expected);
14285}
14286
14287TEST_SVE(sve_brkb) {
14288 // clang-format off
14289 // | boundary of 128-bits VL.
14290 // v
14291 int pd[] = {1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14292
14293 // | highest-numbered lane lowest-numbered lane |
14294 // v v
14295 int pg_1[] = {1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
14296 int pg_2[] = {1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
14297
14298 int pn_1[] = {1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
14299 int pn_2[] = {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14300 int pn_3[] = {1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1};
14301
14302 // | first break
14303 // v
14304 int exp_1_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
14305 // | first break
14306 // v
14307 int exp_1_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
14308 // | first break
14309 // v
14310 int exp_1_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0};
14311
14312 BrkbHelper(config, pd, pg_1, pn_1, exp_1_1);
14313 BrkbHelper(config, pd, pg_1, pn_2, exp_1_2);
14314 BrkbHelper(config, pd, pg_1, pn_3, exp_1_3);
14315
14316 // | first break
14317 // v
14318 int exp_2_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1};
14319 // | first break
14320 // v
14321 int exp_2_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
14322 // | first break
14323 // v
14324 int exp_2_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14325 BrkbHelper(config, pd, pg_2, pn_1, exp_2_1);
14326 BrkbHelper(config, pd, pg_2, pn_2, exp_2_2);
14327 BrkbHelper(config, pd, pg_2, pn_3, exp_2_3);
14328
14329 // An all-inactive governing predicate with zeroing sets the destination predicate to all-false.
14330 int pg_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14331 int exp_3_x[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14332 BrkbHelper(config, pd, pg_3, pn_1, exp_3_x);
14333 BrkbHelper(config, pd, pg_3, pn_2, exp_3_x);
14334 BrkbHelper(config, pd, pg_3, pn_3, exp_3_x);
14335 // clang-format on
14336}
14337
14338typedef void (MacroAssembler::*BrknFn)(const PRegisterWithLaneSize& pd,
14339 const PRegisterZ& pg,
14340 const PRegisterWithLaneSize& pn,
14341 const PRegisterWithLaneSize& pm);
14342
14343typedef void (MacroAssembler::*BrknsFn)(const PRegisterWithLaneSize& pd,
14344 const PRegisterZ& pg,
14345 const PRegisterWithLaneSize& pn,
14346 const PRegisterWithLaneSize& pm);
14347
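// Brkn propagates the break condition to the next partition: if the last
// active element of `pn` is false, the destination is expected to become
// all-false; otherwise it is left equal to `pm`. The enum below names these
// two expected outcomes.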
14348enum BrknDstPredicateState { kAllFalse, kUnchanged };
14349
14350template <typename T, size_t N>
14351static void BrknHelper(Test* config,
TatWai Chong5d872292020-01-02 15:39:51 -080014352 const T (&pd_inputs)[N],
14353 const T (&pg_inputs)[N],
14354 const T (&pn_inputs)[N],
14355 const T (&pm_inputs)[N],
14356 BrknDstPredicateState expected_pd_state) {
14357 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14358 START();
14359
14360 PRegister pg = p10;
14361 PRegister pn = p9;
14362 PRegister pm = p8;
14363 PRegister pdm = p0;
14364 PRegister pd = p1;
14365 PRegister pd_s = p2;
14366 Initialise(&masm, pg.VnB(), pg_inputs);
14367 Initialise(&masm, pn.VnB(), pn_inputs);
14368 Initialise(&masm, pm.VnB(), pm_inputs);
14369 Initialise(&masm, pdm.VnB(), pm_inputs);
14370 Initialise(&masm, pd.VnB(), pd_inputs);
14371 Initialise(&masm, pd_s.VnB(), pd_inputs);
14372
14373 // Initialise NZCV to an impossible value, to check that we actually write it.
14374 __ Mov(x10, NZCVFlag);
14375 __ Msr(NZCV, x10);
14376
Jacob Bramleya3d61102020-07-01 16:49:47 +010014377 __ Brkn(pdm.VnB(), pg.Zeroing(), pn.VnB(), pdm.VnB());
TatWai Chong5d872292020-01-02 15:39:51 -080014378 // !pd.Aliases(pm).
Jacob Bramleya3d61102020-07-01 16:49:47 +010014379 __ Brkn(pd.VnB(), pg.Zeroing(), pn.VnB(), pm.VnB());
14380 __ Brkns(pd_s.VnB(), pg.Zeroing(), pn.VnB(), pm.VnB());
TatWai Chong5d872292020-01-02 15:39:51 -080014381 __ Mrs(x0, NZCV);
14382
14383 END();
14384
14385 if (CAN_RUN()) {
14386 RUN();
14387
14388 T all_false[N] = {0};
14389 if (expected_pd_state == kAllFalse) {
14390 ASSERT_EQUAL_SVE(all_false, pd.VnB());
14391 } else {
14392 ASSERT_EQUAL_SVE(pm_inputs, pd.VnB());
14393 }
14394 ASSERT_EQUAL_SVE(pm_inputs, pm.VnB());
14395
Jacob Bramleya3d61102020-07-01 16:49:47 +010014396 T all_true[N];
14397 for (size_t i = 0; i < ArrayLength(all_true); i++) {
14398 all_true[i] = 1;
14399 }
14400
TatWai Chong5d872292020-01-02 15:39:51 -080014401 // Check that the flags were properly set.
14402 StatusFlags nzcv_expected =
14403 GetPredTestFlags((expected_pd_state == kAllFalse) ? all_false
14404 : pm_inputs,
Jacob Bramleya3d61102020-07-01 16:49:47 +010014405 all_true,
TatWai Chong5d872292020-01-02 15:39:51 -080014406 core.GetSVELaneCount(kBRegSize));
14407 ASSERT_EQUAL_64(nzcv_expected, x0);
14408 ASSERT_EQUAL_SVE(pd.VnB(), pdm.VnB());
14409 ASSERT_EQUAL_SVE(pd.VnB(), pd_s.VnB());
14410 }
14411}
14412
14413TEST_SVE(sve_brkn) {
Jacob Bramleya3d61102020-07-01 16:49:47 +010014414 int pd[] = {1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
14415 int pm[] = {0, 1, 1, 1, 1, 0, 0, 1, 0, 1};
TatWai Chong5d872292020-01-02 15:39:51 -080014416
Jacob Bramleya3d61102020-07-01 16:49:47 +010014417 int pg_1[] = {1, 1, 0, 0, 1, 0, 1, 1, 0, 0};
14418 int pg_2[] = {0, 0, 0, 1, 1, 1, 0, 0, 1, 1};
14419 int pg_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
TatWai Chong5d872292020-01-02 15:39:51 -080014420
Jacob Bramleya3d61102020-07-01 16:49:47 +010014421 int pn_1[] = {1, 0, 0, 0, 0, 1, 1, 0, 0, 0};
14422 int pn_2[] = {0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
14423 int pn_3[] = {0, 0, 0, 0, 1, 1, 0, 0, 1, 1};
TatWai Chong5d872292020-01-02 15:39:51 -080014424
Jacob Bramleya3d61102020-07-01 16:49:47 +010014425 BrknHelper(config, pd, pg_1, pn_1, pm, kUnchanged);
14426 BrknHelper(config, pd, pg_1, pn_2, pm, kAllFalse);
14427 BrknHelper(config, pd, pg_1, pn_3, pm, kAllFalse);
TatWai Chong5d872292020-01-02 15:39:51 -080014428
Jacob Bramleya3d61102020-07-01 16:49:47 +010014429 BrknHelper(config, pd, pg_2, pn_1, pm, kAllFalse);
14430 BrknHelper(config, pd, pg_2, pn_2, pm, kUnchanged);
14431 BrknHelper(config, pd, pg_2, pn_3, pm, kAllFalse);
TatWai Chong5d872292020-01-02 15:39:51 -080014432
Jacob Bramleya3d61102020-07-01 16:49:47 +010014433 BrknHelper(config, pd, pg_3, pn_1, pm, kAllFalse);
14434 BrknHelper(config, pd, pg_3, pn_2, pm, kAllFalse);
14435 BrknHelper(config, pd, pg_3, pn_3, pm, kAllFalse);
TatWai Chong5d872292020-01-02 15:39:51 -080014436}
14437
Martyn Capewell15f89012020-01-09 11:18:30 +000014438TEST_SVE(sve_trn) {
14439 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14440 START();
14441
14442 uint64_t in0[] = {0xffeeddccbbaa9988, 0x7766554433221100};
14443 uint64_t in1[] = {0xaa55aa55aa55aa55, 0x55aa55aa55aa55aa};
14444 InsrHelper(&masm, z0.VnD(), in0);
14445 InsrHelper(&masm, z1.VnD(), in1);
14446
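  // Trn1 interleaves the even-numbered elements of the two sources and Trn2
  // interleaves the odd-numbered elements, for each element size.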
14447 __ Trn1(z2.VnB(), z0.VnB(), z1.VnB());
14448 __ Trn2(z3.VnB(), z0.VnB(), z1.VnB());
14449 __ Trn1(z4.VnH(), z0.VnH(), z1.VnH());
14450 __ Trn2(z5.VnH(), z0.VnH(), z1.VnH());
14451 __ Trn1(z6.VnS(), z0.VnS(), z1.VnS());
14452 __ Trn2(z7.VnS(), z0.VnS(), z1.VnS());
14453 __ Trn1(z8.VnD(), z0.VnD(), z1.VnD());
14454 __ Trn2(z9.VnD(), z0.VnD(), z1.VnD());
14455
14456 END();
14457
14458 if (CAN_RUN()) {
14459 RUN();
14460 uint64_t expected_z2[] = {0x55ee55cc55aa5588, 0xaa66aa44aa22aa00};
14461 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
14462 uint64_t expected_z3[] = {0xaaffaaddaabbaa99, 0x5577555555335511};
14463 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
14464 uint64_t expected_z4[] = {0xaa55ddccaa559988, 0x55aa554455aa1100};
14465 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
14466 uint64_t expected_z5[] = {0xaa55ffeeaa55bbaa, 0x55aa776655aa3322};
14467 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
14468 uint64_t expected_z6[] = {0xaa55aa55bbaa9988, 0x55aa55aa33221100};
14469 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
14470 uint64_t expected_z7[] = {0xaa55aa55ffeeddcc, 0x55aa55aa77665544};
14471 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
14472 uint64_t expected_z8[] = {0x55aa55aa55aa55aa, 0x7766554433221100};
14473 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
14474 uint64_t expected_z9[] = {0xaa55aa55aa55aa55, 0xffeeddccbbaa9988};
14475 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
14476 }
14477}
14478
14479TEST_SVE(sve_zip_uzp) {
14480 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14481 START();
14482
14483 __ Dup(z0.VnD(), 0xffeeddccbbaa9988);
14484 __ Insr(z0.VnD(), 0x7766554433221100);
14485 __ Dup(z1.VnD(), 0xaa55aa55aa55aa55);
14486 __ Insr(z1.VnD(), 0x55aa55aa55aa55aa);
14487
14488 __ Zip1(z2.VnB(), z0.VnB(), z1.VnB());
14489 __ Zip2(z3.VnB(), z0.VnB(), z1.VnB());
14490 __ Zip1(z4.VnH(), z0.VnH(), z1.VnH());
14491 __ Zip2(z5.VnH(), z0.VnH(), z1.VnH());
14492 __ Zip1(z6.VnS(), z0.VnS(), z1.VnS());
14493 __ Zip2(z7.VnS(), z0.VnS(), z1.VnS());
14494 __ Zip1(z8.VnD(), z0.VnD(), z1.VnD());
14495 __ Zip2(z9.VnD(), z0.VnD(), z1.VnD());
14496
14497 __ Uzp1(z10.VnB(), z2.VnB(), z3.VnB());
14498 __ Uzp2(z11.VnB(), z2.VnB(), z3.VnB());
14499 __ Uzp1(z12.VnH(), z4.VnH(), z5.VnH());
14500 __ Uzp2(z13.VnH(), z4.VnH(), z5.VnH());
14501 __ Uzp1(z14.VnS(), z6.VnS(), z7.VnS());
14502 __ Uzp2(z15.VnS(), z6.VnS(), z7.VnS());
14503 __ Uzp1(z16.VnD(), z8.VnD(), z9.VnD());
14504 __ Uzp2(z17.VnD(), z8.VnD(), z9.VnD());
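  // For reference: ZIP1/ZIP2 interleave the elements of the low/high halves of
  // the two sources, while UZP1/UZP2 concatenate the even/odd-numbered
  // elements of the sources. Unzipping the zipped results therefore
  // reconstructs the original operands, which the checks below rely on.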
14505
14506 END();
14507
14508 if (CAN_RUN()) {
14509 RUN();
14510 uint64_t expected_z2[] = {0x5577aa665555aa44, 0x5533aa225511aa00};
14511 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
14512 uint64_t expected_z3[] = {0xaaff55eeaadd55cc, 0xaabb55aaaa995588};
14513 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
14514 uint64_t expected_z4[] = {0x55aa776655aa5544, 0x55aa332255aa1100};
14515 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
14516 uint64_t expected_z5[] = {0xaa55ffeeaa55ddcc, 0xaa55bbaaaa559988};
14517 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
14518 uint64_t expected_z6[] = {0x55aa55aa77665544, 0x55aa55aa33221100};
14519 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
14520 uint64_t expected_z7[] = {0xaa55aa55ffeeddcc, 0xaa55aa55bbaa9988};
14521 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
14522 uint64_t expected_z8[] = {0x55aa55aa55aa55aa, 0x7766554433221100};
14523 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
14524 uint64_t expected_z9[] = {0xaa55aa55aa55aa55, 0xffeeddccbbaa9988};
14525 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
14526
14527 // Check uzp is the opposite of zip.
14528 ASSERT_EQUAL_SVE(z0.VnD(), z10.VnD());
14529 ASSERT_EQUAL_SVE(z1.VnD(), z11.VnD());
14530 ASSERT_EQUAL_SVE(z0.VnD(), z12.VnD());
14531 ASSERT_EQUAL_SVE(z1.VnD(), z13.VnD());
14532 ASSERT_EQUAL_SVE(z0.VnD(), z14.VnD());
14533 ASSERT_EQUAL_SVE(z1.VnD(), z15.VnD());
14534 ASSERT_EQUAL_SVE(z0.VnD(), z16.VnD());
14535 ASSERT_EQUAL_SVE(z1.VnD(), z17.VnD());
14536 }
14537}
Martyn Capewell50e9f552020-01-07 17:45:03 +000014538
Martyn Capewell0b1afa82020-03-04 11:31:42 +000014539TEST_SVE(sve_fcadd) {
14540 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14541 START();
14542
14543 __ Dup(z30.VnS(), 0);
14544
14545 __ Ptrue(p0.VnB());
14546 __ Pfalse(p1.VnB());
14547 __ Zip1(p2.VnH(), p0.VnH(), p1.VnH()); // Real elements.
14548 __ Zip1(p3.VnH(), p1.VnH(), p0.VnH()); // Imaginary elements.
14549
14550 __ Fdup(z0.VnH(), 10.0); // 10i + 10
14551 __ Fdup(z1.VnH(), 5.0); // 5i + 5
14552 __ Index(z7.VnH(), 1, 1);
14553 __ Scvtf(z7.VnH(), p0.Merging(), z7.VnH()); // Ai + B
14554
14555 __ Sel(z2.VnH(), p3, z1.VnH(), z30.VnH()); // 5i + 0
14556 __ Sel(z3.VnH(), p2, z1.VnH(), z30.VnH()); // 0i + 5
14557 __ Sel(z7.VnH(), p3, z7.VnH(), z0.VnH()); // Ai + 10
14558 __ Ext(z8.VnB(), z7.VnB(), z7.VnB(), 2);
14559 __ Sel(z8.VnH(), p2, z8.VnH(), z30.VnH()); // 0i + A
14560
14561 // (10i + 10) + rotate(5i + 0, 90)
14562 // = (10i + 10) + (0i - 5)
14563 // = 10i + 5
14564 __ Fcadd(z4.VnH(), p0.Merging(), z0.VnH(), z2.VnH(), 90);
14565
14566 // (10i + 5) + rotate(0i + 5, 270)
14567 // = (10i + 5) + (-5i + 0)
14568 // = 5i + 5
14569 __ Fcadd(z4.VnH(), p0.Merging(), z4.VnH(), z3.VnH(), 270);
14570
14571 // The same calculation, but selecting real/imaginary using predication.
14572 __ Mov(z5, z0);
14573 __ Fcadd(z5.VnH(), p2.Merging(), z5.VnH(), z1.VnH(), 90);
14574 __ Fcadd(z5.VnH(), p3.Merging(), z5.VnH(), z1.VnH(), 270);
14575
14576 // Reference calculation: (10i + 10) - (5i + 5)
14577 __ Fsub(z6.VnH(), z0.VnH(), z1.VnH());
14578
14579 // Calculation using varying imaginary values.
14580 // (Ai + 10) + rotate(5i + 0, 90)
14581 // = (Ai + 10) + (0i - 5)
14582 // = Ai + 5
14583 __ Fcadd(z7.VnH(), p0.Merging(), z7.VnH(), z2.VnH(), 90);
14584
14585 // (Ai + 5) + rotate(0i + A, 270)
14586 // = (Ai + 5) + (-Ai + 0)
14587 // = 5
14588 __ Fcadd(z7.VnH(), p0.Merging(), z7.VnH(), z8.VnH(), 270);
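  // For reference, a scalar sketch of what FCADD does to each (real, imag)
  // pair of lanes, where zn is the accumulated operand and zm the rotated one:
  //   rot == 90:  zd.real = zn.real - zm.imag; zd.imag = zn.imag + zm.real
  //   rot == 270: zd.real = zn.real + zm.imag; zd.imag = zn.imag - zm.real
  // In effect zm is multiplied by +/-i before the addition, which is how the
  // expected values in the comments above were derived.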
14589
14590 // Repeated, but for wider elements.
14591 __ Zip1(p2.VnS(), p0.VnS(), p1.VnS());
14592 __ Zip1(p3.VnS(), p1.VnS(), p0.VnS());
14593 __ Fdup(z0.VnS(), 42.0);
14594 __ Fdup(z1.VnS(), 21.0);
14595 __ Index(z11.VnS(), 1, 1);
14596 __ Scvtf(z11.VnS(), p0.Merging(), z11.VnS());
14597 __ Sel(z2.VnS(), p3, z1.VnS(), z30.VnS());
14598 __ Sel(z29.VnS(), p2, z1.VnS(), z30.VnS());
14599 __ Sel(z11.VnS(), p3, z11.VnS(), z0.VnS());
14600 __ Ext(z12.VnB(), z11.VnB(), z11.VnB(), 4);
14601 __ Sel(z12.VnS(), p2, z12.VnS(), z30.VnS());
14602 __ Fcadd(z8.VnS(), p0.Merging(), z0.VnS(), z2.VnS(), 90);
14603 __ Fcadd(z8.VnS(), p0.Merging(), z8.VnS(), z29.VnS(), 270);
14604 __ Mov(z9, z0);
14605 __ Fcadd(z9.VnS(), p2.Merging(), z9.VnS(), z1.VnS(), 90);
14606 __ Fcadd(z9.VnS(), p3.Merging(), z9.VnS(), z1.VnS(), 270);
14607 __ Fsub(z10.VnS(), z0.VnS(), z1.VnS());
14608 __ Fcadd(z11.VnS(), p0.Merging(), z11.VnS(), z2.VnS(), 90);
14609 __ Fcadd(z11.VnS(), p0.Merging(), z11.VnS(), z12.VnS(), 270);
14610
14611 __ Zip1(p2.VnD(), p0.VnD(), p1.VnD());
14612 __ Zip1(p3.VnD(), p1.VnD(), p0.VnD());
14613 __ Fdup(z0.VnD(), -42.0);
14614 __ Fdup(z1.VnD(), -21.0);
14615 __ Index(z15.VnD(), 1, 1);
14616 __ Scvtf(z15.VnD(), p0.Merging(), z15.VnD());
14617 __ Sel(z2.VnD(), p3, z1.VnD(), z30.VnD());
14618 __ Sel(z28.VnD(), p2, z1.VnD(), z30.VnD());
14619 __ Sel(z15.VnD(), p3, z15.VnD(), z0.VnD());
14620 __ Ext(z16.VnB(), z15.VnB(), z15.VnB(), 8);
14621 __ Sel(z16.VnD(), p2, z16.VnD(), z30.VnD());
14622 __ Fcadd(z12.VnD(), p0.Merging(), z0.VnD(), z2.VnD(), 90);
14623 __ Fcadd(z12.VnD(), p0.Merging(), z12.VnD(), z28.VnD(), 270);
14624 __ Mov(z13, z0);
14625 __ Fcadd(z13.VnD(), p2.Merging(), z13.VnD(), z1.VnD(), 90);
14626 __ Fcadd(z13.VnD(), p3.Merging(), z13.VnD(), z1.VnD(), 270);
14627 __ Fsub(z14.VnD(), z0.VnD(), z1.VnD());
14628 __ Fcadd(z15.VnD(), p0.Merging(), z15.VnD(), z2.VnD(), 90);
14629 __ Fcadd(z15.VnD(), p0.Merging(), z15.VnD(), z16.VnD(), 270);
14630 END();
14631
14632 if (CAN_RUN()) {
14633 RUN();
14634 ASSERT_EQUAL_SVE(z6.VnH(), z4.VnH());
14635 ASSERT_EQUAL_SVE(z6.VnH(), z5.VnH());
14636 ASSERT_EQUAL_SVE(z3.VnH(), z7.VnH());
14637 ASSERT_EQUAL_SVE(z10.VnS(), z8.VnS());
14638 ASSERT_EQUAL_SVE(z10.VnS(), z9.VnS());
14639 ASSERT_EQUAL_SVE(z29.VnS(), z11.VnS());
14640 ASSERT_EQUAL_SVE(z14.VnD(), z12.VnD());
14641 ASSERT_EQUAL_SVE(z14.VnD(), z13.VnD());
14642 ASSERT_EQUAL_SVE(z28.VnS(), z15.VnS());
14643 }
14644}
14645
Martyn Capewelle4886e52020-03-30 09:28:52 +010014646TEST_SVE(sve_fcmla_index) {
14647 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14648 START();
14649
14650 __ Ptrue(p0.VnB());
14651
14652 __ Fdup(z0.VnH(), 10.0);
14653 __ Fdup(z2.VnH(), 2.0);
14654 __ Zip1(z0.VnH(), z0.VnH(), z2.VnH());
14655
14656 // Duplicate complex numbers across z2 segments. First segment has 1i+0,
14657 // second has 3i+2, etc.
14658 __ Index(z1.VnH(), 0, 1);
14659 __ Scvtf(z1.VnH(), p0.Merging(), z1.VnH());
14660 __ Zip1(z2.VnS(), z1.VnS(), z1.VnS());
14661 __ Zip1(z2.VnS(), z2.VnS(), z2.VnS());
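  // Worked example of the layout built above, with z1 holding 0, 1, 2, 3, ...
  // in half-word lanes:
  //   after the first zip1:  z2.s = [A0, A0, A1, A1, A2, A2, ...]
  //   after the second zip1: z2.s = [A0, A0, A0, A0, A1, A1, A1, A1, ...]
  // where Ak is word k of z1, i.e. the complex value with real part 2k and
  // imaginary part 2k + 1. Each 128-bit segment (four word lanes) therefore
  // repeats a single complex value.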
14662
14663 // Derive a vector from z2 where only the third element in each segment
14664 // contains a complex number, with other elements zero.
14665 __ Index(z3.VnS(), 0, 1);
14666 __ And(z3.VnS(), z3.VnS(), 3);
14667 __ Cmpeq(p2.VnS(), p0.Zeroing(), z3.VnS(), 2);
14668 __ Dup(z3.VnB(), 0);
14669 __ Sel(z3.VnS(), p2, z2.VnS(), z3.VnS());
14670
14671 // Use indexed complex multiply on this vector, indexing the third element.
14672 __ Dup(z4.VnH(), 0);
14673 __ Fcmla(z4.VnH(), z0.VnH(), z3.VnH(), 2, 0);
14674 __ Fcmla(z4.VnH(), z0.VnH(), z3.VnH(), 2, 90);
14675
14676 // Rotate the indexed complex number and repeat, negated, and with a different
14677 // index.
14678 __ Ext(z3.VnH(), z3.VnH(), z3.VnH(), 4);
14679 __ Dup(z5.VnH(), 0);
14680 __ Fcmla(z5.VnH(), z0.VnH(), z3.VnH(), 1, 180);
14681 __ Fcmla(z5.VnH(), z0.VnH(), z3.VnH(), 1, 270);
14682 __ Fneg(z5.VnH(), p0.Merging(), z5.VnH());
14683
14684 // Create a reference result from a vector complex multiply.
14685 __ Dup(z6.VnH(), 0);
14686 __ Fcmla(z6.VnH(), p0.Merging(), z0.VnH(), z2.VnH(), 0);
14687 __ Fcmla(z6.VnH(), p0.Merging(), z0.VnH(), z2.VnH(), 90);
14688
14689 // Repeated, but for wider elements.
14690 __ Fdup(z0.VnS(), 42.0);
14691 __ Fdup(z2.VnS(), 24.0);
14692 __ Zip1(z0.VnS(), z0.VnS(), z2.VnS());
14693 __ Index(z1.VnS(), -42, 13);
14694 __ Scvtf(z1.VnS(), p0.Merging(), z1.VnS());
14695 __ Zip1(z2.VnD(), z1.VnD(), z1.VnD());
14696 __ Zip1(z2.VnD(), z2.VnD(), z2.VnD());
14697 __ Index(z3.VnD(), 0, 1);
14698 __ And(z3.VnD(), z3.VnD(), 1);
14699 __ Cmpeq(p2.VnD(), p0.Zeroing(), z3.VnD(), 1);
14700 __ Dup(z3.VnB(), 0);
14701 __ Sel(z3.VnD(), p2, z2.VnD(), z3.VnD());
14702 __ Dup(z7.VnS(), 0);
14703 __ Fcmla(z7.VnS(), z0.VnS(), z3.VnS(), 1, 0);
14704 __ Fcmla(z7.VnS(), z0.VnS(), z3.VnS(), 1, 90);
14705 __ Ext(z3.VnB(), z3.VnB(), z3.VnB(), 8);
14706 __ Dup(z8.VnS(), 0);
14707 __ Fcmla(z8.VnS(), z0.VnS(), z3.VnS(), 0, 180);
14708 __ Fcmla(z8.VnS(), z0.VnS(), z3.VnS(), 0, 270);
14709 __ Fneg(z8.VnS(), p0.Merging(), z8.VnS());
14710 __ Dup(z9.VnS(), 0);
14711 __ Fcmla(z9.VnS(), p0.Merging(), z0.VnS(), z2.VnS(), 0);
14712 __ Fcmla(z9.VnS(), p0.Merging(), z0.VnS(), z2.VnS(), 90);
14713 END();
14714
14715 if (CAN_RUN()) {
14716 RUN();
14717 ASSERT_EQUAL_SVE(z6.VnH(), z4.VnH());
14718 ASSERT_EQUAL_SVE(z6.VnH(), z5.VnH());
14719 ASSERT_EQUAL_SVE(z9.VnS(), z7.VnS());
14720 ASSERT_EQUAL_SVE(z9.VnS(), z8.VnS());
14721 }
14722}
14723
Martyn Capewell75f1c432020-03-30 09:23:27 +010014724TEST_SVE(sve_fcmla) {
14725 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14726 START();
14727
14728 __ Ptrue(p0.VnB());
14729 __ Pfalse(p1.VnB());
14730 __ Zip1(p2.VnH(), p0.VnH(), p1.VnH()); // Real elements.
14731 __ Zip1(p3.VnH(), p1.VnH(), p0.VnH()); // Imaginary elements.
14732
14733 __ Fdup(z0.VnH(), 10.0);
14734 __ Fdup(z2.VnH(), 2.0);
14735
14736 // Create pairs of complex numbers, Ai + A. A is chosen to be non-zero, as
14737 // the later fneg will result in a failed comparison otherwise.
14738 __ Index(z1.VnH(), -4, 3);
14739 __ Zip1(z1.VnH(), z1.VnH(), z1.VnH());
14740 __ Zip1(z1.VnH(), z1.VnH(), z1.VnH());
14741 __ Scvtf(z1.VnH(), p0.Merging(), z1.VnH());
14742
14743 __ Sel(z3.VnH(), p2, z0.VnH(), z1.VnH()); // Ai + 10
14744 __ Sel(z4.VnH(), p2, z1.VnH(), z2.VnH()); // 2i + A
14745
14746 __ Zip1(p2.VnS(), p0.VnS(), p1.VnS()); // Even complex numbers.
14747 __ Zip1(p3.VnS(), p1.VnS(), p0.VnS()); // Odd complex numbers.
14748
14749 // Calculate (Ai + 10) * (2i + A) = (20 + A^2)i + 8A, using predication to
14750 // select only the complex numbers in odd-numbered element pairs. This leaves
14751 // results in elements 2/3, 6/7, etc. with zero in elements 0/1, 4/5, etc.
14752 // ... 7 6 5 4 3 2 1 0 <-- element
14753 // ... | 20+A^2 | 8A | 0 | 0 | 20+A^2 | 8A | 0 | 0 | <-- value
14754 __ Dup(z5.VnH(), 0);
14755 __ Fcmla(z5.VnH(), p3.Merging(), z4.VnH(), z3.VnH(), 0);
14756 __ Fcmla(z5.VnH(), p3.Merging(), z4.VnH(), z3.VnH(), 90);
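  // For reference: the rotation-0 and rotation-90 forms of FCMLA each
  // accumulate one half of the complex product, so issuing both accumulates
  // the full product zn * zm into the destination pairs. The rotation-180 and
  // rotation-270 forms accumulate the negated halves, giving -(zn * zm), which
  // is why the fneg below makes the two results comparable.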
14757
14758 // Move the odd results to the even result positions.
14759 // ... 7 6 5 4 3 2 1 0 <-- element
14760 // ... | 0 | 0 | 20+A^2 | 8A | 0 | 0 | 20+A^2 | 8A | <-- value
14761 __ Ext(z5.VnB(), z5.VnB(), z5.VnB(), 4);
14762
14763 // Calculate -(Ai + 10) * (2i + A) = -(20 + A^2)i - 8A for the even complex
14764 // numbers.
14765 // ... 7 6 5 4 3 2 1 0 <-- element
14766 // ... | 0 | 0 | -20-A^2 | -8A | 0 | 0 | -20-A^2 | -8A | <-- value
14767 __ Dup(z6.VnH(), 0);
14768 __ Fcmla(z6.VnH(), p2.Merging(), z4.VnH(), z3.VnH(), 180);
14769 __ Fcmla(z6.VnH(), p2.Merging(), z4.VnH(), z3.VnH(), 270);
14770
14771 // Negate the even results. The results in z6 should now match the results
14772 // computed earlier in z5.
14773 // ... 7 6 5 4 3 2 1 0 <-- element
14774 // ... | 0 | 0 | 20+A^2 | 8A | 0 | 0 | 20+A^2 | 8A | <-- value
14775 __ Fneg(z6.VnH(), p2.Merging(), z6.VnH());
14776
14777
14778 // Similarly, but for wider elements.
14779 __ Zip1(p2.VnS(), p0.VnS(), p1.VnS());
14780 __ Zip1(p3.VnS(), p1.VnS(), p0.VnS());
14781 __ Index(z1.VnS(), -4, 3);
14782 __ Zip1(z1.VnS(), z1.VnS(), z1.VnS());
14783 __ Zip1(z1.VnS(), z1.VnS(), z1.VnS());
14784 __ Scvtf(z1.VnS(), p0.Merging(), z1.VnS());
14785 __ Fdup(z0.VnS(), 20.0);
14786 __ Fdup(z2.VnS(), 21.0);
14787 __ Sel(z3.VnS(), p2, z0.VnS(), z1.VnS());
14788 __ Sel(z4.VnS(), p2, z1.VnS(), z2.VnS());
14789 __ Punpklo(p2.VnH(), p2.VnB());
14790 __ Punpklo(p3.VnH(), p3.VnB());
14791 __ Dup(z7.VnS(), 0);
14792 __ Fcmla(z7.VnS(), p3.Merging(), z4.VnS(), z3.VnS(), 0);
14793 __ Fcmla(z7.VnS(), p3.Merging(), z4.VnS(), z3.VnS(), 90);
14794 __ Ext(z7.VnB(), z7.VnB(), z7.VnB(), 8);
14795 __ Dup(z8.VnS(), 0);
14796 __ Fcmla(z8.VnS(), p2.Merging(), z4.VnS(), z3.VnS(), 180);
14797 __ Fcmla(z8.VnS(), p2.Merging(), z4.VnS(), z3.VnS(), 270);
14798 __ Fneg(z8.VnS(), p2.Merging(), z8.VnS());
14799
14800 // Double precision computed for even lanes only.
14801 __ Zip1(p2.VnD(), p0.VnD(), p1.VnD());
14802 __ Index(z1.VnD(), -4, 3);
14803 __ Zip1(z1.VnD(), z1.VnD(), z1.VnD());
14804 __ Zip1(z1.VnD(), z1.VnD(), z1.VnD());
14805 __ Scvtf(z1.VnD(), p0.Merging(), z1.VnD());
14806 __ Fdup(z0.VnD(), 20.0);
14807 __ Fdup(z2.VnD(), 21.0);
14808 __ Sel(z3.VnD(), p2, z0.VnD(), z1.VnD());
14809 __ Sel(z4.VnD(), p2, z1.VnD(), z2.VnD());
14810 __ Punpklo(p2.VnH(), p2.VnB());
14811 __ Dup(z9.VnD(), 0);
14812 __ Fcmla(z9.VnD(), p2.Merging(), z4.VnD(), z3.VnD(), 0);
14813 __ Fcmla(z9.VnD(), p2.Merging(), z4.VnD(), z3.VnD(), 90);
14814 __ Dup(z10.VnD(), 0);
14815 __ Fcmla(z10.VnD(), p2.Merging(), z4.VnD(), z3.VnD(), 180);
14816 __ Fcmla(z10.VnD(), p2.Merging(), z4.VnD(), z3.VnD(), 270);
14817 __ Fneg(z10.VnD(), p2.Merging(), z10.VnD());
14818 END();
14819
14820 if (CAN_RUN()) {
14821 RUN();
14822 ASSERT_EQUAL_SVE(z5.VnH(), z6.VnH());
14823 ASSERT_EQUAL_SVE(z7.VnS(), z8.VnS());
14824 ASSERT_EQUAL_SVE(z9.VnD(), z10.VnD());
14825 }
14826}
14827
Martyn Capewell50e9f552020-01-07 17:45:03 +000014828TEST_SVE(sve_fpmul_index) {
14829 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14830 START();
14831
14832 uint64_t in0[] = {0x3ff000003f803c00, 0xbff00000bf80bc00};
14833 uint64_t in1[] = {0x3ff012343ff03c76, 0xbff01234bff0bc76};
14834
14835 InsrHelper(&masm, z0.VnD(), in0);
14836 InsrHelper(&masm, z1.VnD(), in1);
14837
14838 __ Fmul(z2.VnH(), z1.VnH(), z0.VnH(), 0);
14839 __ Fmul(z3.VnH(), z1.VnH(), z0.VnH(), 1);
14840 __ Fmul(z4.VnH(), z1.VnH(), z0.VnH(), 4);
14841 __ Fmul(z5.VnH(), z1.VnH(), z0.VnH(), 7);
14842
14843 __ Fmul(z6.VnS(), z1.VnS(), z0.VnS(), 0);
14844 __ Fmul(z7.VnS(), z1.VnS(), z0.VnS(), 1);
14845 __ Fmul(z8.VnS(), z1.VnS(), z0.VnS(), 2);
14846 __ Fmul(z9.VnS(), z1.VnS(), z0.VnS(), 3);
14847
14848 __ Fmul(z10.VnD(), z1.VnD(), z0.VnD(), 0);
14849 __ Fmul(z11.VnD(), z1.VnD(), z0.VnD(), 1);
14850
14851 // Compute the results using other instructions.
14852 __ Dup(z12.VnH(), z0.VnH(), 0);
14853 __ Fmul(z12.VnH(), z1.VnH(), z12.VnH());
14854 __ Dup(z13.VnH(), z0.VnH(), 1);
14855 __ Fmul(z13.VnH(), z1.VnH(), z13.VnH());
14856 __ Dup(z14.VnH(), z0.VnH(), 4);
14857 __ Fmul(z14.VnH(), z1.VnH(), z14.VnH());
14858 __ Dup(z15.VnH(), z0.VnH(), 7);
14859 __ Fmul(z15.VnH(), z1.VnH(), z15.VnH());
14860
14861 __ Dup(z16.VnS(), z0.VnS(), 0);
14862 __ Fmul(z16.VnS(), z1.VnS(), z16.VnS());
14863 __ Dup(z17.VnS(), z0.VnS(), 1);
14864 __ Fmul(z17.VnS(), z1.VnS(), z17.VnS());
14865 __ Dup(z18.VnS(), z0.VnS(), 2);
14866 __ Fmul(z18.VnS(), z1.VnS(), z18.VnS());
14867 __ Dup(z19.VnS(), z0.VnS(), 3);
14868 __ Fmul(z19.VnS(), z1.VnS(), z19.VnS());
14869
14870 __ Dup(z20.VnD(), z0.VnD(), 0);
14871 __ Fmul(z20.VnD(), z1.VnD(), z20.VnD());
14872 __ Dup(z21.VnD(), z0.VnD(), 1);
14873 __ Fmul(z21.VnD(), z1.VnD(), z21.VnD());
14874
14875 END();
14876
14877 if (CAN_RUN()) {
14878 RUN();
14879 ASSERT_EQUAL_SVE(z12.VnH(), z2.VnH());
14880 ASSERT_EQUAL_SVE(z13.VnH(), z3.VnH());
14881 ASSERT_EQUAL_SVE(z14.VnH(), z4.VnH());
14882 ASSERT_EQUAL_SVE(z15.VnH(), z5.VnH());
14883 ASSERT_EQUAL_SVE(z16.VnS(), z6.VnS());
14884 ASSERT_EQUAL_SVE(z17.VnS(), z7.VnS());
14885 ASSERT_EQUAL_SVE(z18.VnS(), z8.VnS());
14886 ASSERT_EQUAL_SVE(z19.VnS(), z9.VnS());
14887 ASSERT_EQUAL_SVE(z20.VnD(), z10.VnD());
14888 ASSERT_EQUAL_SVE(z21.VnD(), z11.VnD());
14889 }
14890}
14891
Martyn Capewell5fb2ad62020-01-10 14:08:27 +000014892TEST_SVE(sve_ftmad) {
14893 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14894 START();
14895
14896 uint64_t in_h0[] = {0x7c027e01fc02fe01,
14897 0x3c003c00bc00bc00,
14898 0x3c003c00bc00bc00};
14899 uint64_t in_h1[] = {0xfe01fc027e017e01,
14900 0x3c00bc003c00bc00,
14901 0x3c00bc003c00bc00};
14902 uint64_t in_s0[] = {0x7f800002ffc00001,
14903 0x3f8000003f800000,
14904 0xbf800000bf800000};
14905 uint64_t in_s1[] = {0xffc00001ffc00001,
14906 0x3f800000bf800000,
14907 0x3f800000bf800000};
14908 uint64_t in_d0[] = {0x7ff8000000000001,
14909 0x3ff0000000000000,
14910 0xbff0000000000000};
14911 uint64_t in_d1[] = {0xfff0000000000002,
14912 0xbff0000000000000,
14913 0x3ff0000000000000};
14914 InsrHelper(&masm, z0.VnD(), in_h0);
14915 InsrHelper(&masm, z1.VnD(), in_h1);
14916 InsrHelper(&masm, z2.VnD(), in_s0);
14917 InsrHelper(&masm, z3.VnD(), in_s1);
14918 InsrHelper(&masm, z4.VnD(), in_d0);
14919 InsrHelper(&masm, z5.VnD(), in_d1);
14920
14921 __ Mov(z6, z0);
14922 __ Ftmad(z6.VnH(), z6.VnH(), z1.VnH(), 0);
14923 __ Mov(z7, z0);
14924 __ Ftmad(z7.VnH(), z7.VnH(), z1.VnH(), 1);
14925 __ Mov(z8, z0);
14926 __ Ftmad(z8.VnH(), z8.VnH(), z1.VnH(), 2);
14927
14928 __ Mov(z9, z2);
14929 __ Ftmad(z9.VnS(), z9.VnS(), z3.VnS(), 0);
14930 __ Mov(z10, z2);
14931 __ Ftmad(z10.VnS(), z10.VnS(), z3.VnS(), 3);
14932 __ Mov(z11, z2);
14933 __ Ftmad(z11.VnS(), z11.VnS(), z3.VnS(), 4);
14934
14935 __ Mov(z12, z4);
14936 __ Ftmad(z12.VnD(), z12.VnD(), z5.VnD(), 0);
14937 __ Mov(z13, z4);
14938 __ Ftmad(z13.VnD(), z13.VnD(), z5.VnD(), 5);
14939 __ Mov(z14, z4);
14940 __ Ftmad(z14.VnD(), z14.VnD(), z5.VnD(), 7);
14941
14942 END();
14943
14944 if (CAN_RUN()) {
14945 RUN();
14946 uint64_t expected_z6[] = {0x7e027e02fe02fe01,
14947 0x4000400000000000,
14948 0x4000400000000000};
14949 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
14950 uint64_t expected_z7[] = {0x7e027e02fe02fe01,
14951 0x3aab3800bcabbe00,
14952 0x3aab3800bcabbe00};
14953 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
14954 uint64_t expected_z8[] = {0x7e027e02fe02fe01,
14955 0x3c083c2abbefbbac,
14956 0x3c083c2abbefbbac};
14957 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
14958 uint64_t expected_z9[] = {0x7fc00002ffc00001,
14959 0x4000000040000000,
14960 0x0000000000000000};
14961 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
14962 uint64_t expected_z10[] = {0x7fc00002ffc00001,
14963 0x3f7ff2ff3f7fa4fc,
14964 0xbf800680bf802d82};
14965 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
14966 uint64_t expected_z11[] = {0x7fc00002ffc00001,
14967 0x3f8000173f8000cd,
14968 0xbf7fffd2bf7ffe66};
14969 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
14970 uint64_t expected_z12[] = {0x7ff8000000000002,
14971 0x4000000000000000,
14972 0x0000000000000000};
14973 ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
14974 uint64_t expected_z13[] = {0x7ff8000000000002,
14975 0x3fefffff6c0d846c,
14976 0xbff0000006b978ae};
14977 ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
14978 uint64_t expected_z14[] = {0x7ff8000000000002,
14979 0x3feffffffffe708a,
14980 0xbff0000000000000};
14981 ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
14982 }
14983}
14984
Martyn Capewell37f28182020-01-14 10:15:10 +000014985static void BasicFPArithHelper(MacroAssembler* masm,
14986 int lane_size_in_bits,
14987 const uint64_t (&inputs)[2],
14988 const uint64_t (&inputs_fmulx)[2],
14989 const uint64_t (&inputs_nans)[2]) {
14990 int ls = lane_size_in_bits;
14991
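  // Insert the 128-bit input pattern repeatedly so that it fills the vector at
  // any supported vector length (16 x 128 bits covers the 2048-bit maximum).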
14992 for (int i = 0; i < 16; i++) {
14993 InsrHelper(masm, z0.VnD(), inputs);
14994 }
14995 ZRegister rvrs = z1.WithLaneSize(ls);
14996 masm->Rev(rvrs, z0.WithLaneSize(ls));
14997
14998 int pred[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1};
14999 Initialise(masm, p2.VnB(), pred);
15000 PRegisterM p2m = p2.Merging();
15001
15002 masm->Mov(z2, z0);
15003 masm->Fadd(z2.WithLaneSize(ls),
15004 p2m,
15005 z2.WithLaneSize(ls),
15006 rvrs,
15007 FastNaNPropagation);
15008 masm->Mov(z3, z0);
15009 masm->Fsub(z3.WithLaneSize(ls), p2m, z3.WithLaneSize(ls), rvrs);
15010 masm->Mov(z4, z0);
15011 masm->Fsub(z4.WithLaneSize(ls), p2m, rvrs, z4.WithLaneSize(ls));
15012 masm->Mov(z5, z0);
15013 masm->Fabd(z5.WithLaneSize(ls),
15014 p2m,
15015 z5.WithLaneSize(ls),
15016 rvrs,
15017 FastNaNPropagation);
15018 masm->Mov(z6, z0);
15019 masm->Fmul(z6.WithLaneSize(ls),
15020 p2m,
15021 z6.WithLaneSize(ls),
15022 rvrs,
15023 FastNaNPropagation);
15024
15025 for (int i = 0; i < 16; i++) {
15026 InsrHelper(masm, z7.VnD(), inputs_fmulx);
15027 }
15028 masm->Rev(z8.WithLaneSize(ls), z7.WithLaneSize(ls));
15029 masm->Fmulx(z7.WithLaneSize(ls),
15030 p2m,
15031 z7.WithLaneSize(ls),
15032 z8.WithLaneSize(ls),
15033 FastNaNPropagation);
15034
15035 InsrHelper(masm, z8.VnD(), inputs_nans);
15036 masm->Mov(z9, z8);
15037 masm->Fminnm(z9.WithLaneSize(ls),
15038 p2m,
15039 z9.WithLaneSize(ls),
15040 rvrs,
15041 FastNaNPropagation);
15042 masm->Mov(z10, z8);
15043 masm->Fmaxnm(z10.WithLaneSize(ls),
15044 p2m,
15045 z10.WithLaneSize(ls),
15046 rvrs,
15047 FastNaNPropagation);
15048}
15049
15050TEST_SVE(sve_fp_arith_pred_h) {
15051 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
15052 START();
15053
15054 uint64_t inputs[] = {0x4800470046004500, 0x4400420040003c00};
15055 uint64_t inputs_fmulx[] = {0x7c00fc007c00fc00, 0x0000800000008000};
15056 uint64_t inputs_nans[] = {0x7fffffff7fffffff, 0x7bfffbff7fbbfbff};
15057
15058 BasicFPArithHelper(&masm, kHRegSize, inputs, inputs_fmulx, inputs_nans);
15059
15060 END();
15061
15062 if (CAN_RUN()) {
15063 RUN();
15064 uint64_t expected_z2[] = {0x4880488048804880, 0x4880420048804880};
15065 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
15066 uint64_t expected_z3[] = {0x4700450042003c00, 0xbc004200c500c700};
15067 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
15068 uint64_t expected_z4[] = {0xc700c500c200bc00, 0x3c00420045004700};
15069 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
15070 uint64_t expected_z5[] = {0x4700450042003c00, 0x3c00420045004700};
15071 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
15072 uint64_t expected_z6[] = {0x48004b004c804d00, 0x4d0042004b004800};
15073 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
15074 uint64_t expected_z7[] = {0xc000c000c000c000, 0xc0008000c000c000};
15075 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
15076 uint64_t expected_z9[] = {0x3c00400042004400, 0x4500fbff4700fbff};
15077 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
15078 uint64_t expected_z10[] = {0x3c00400042004400, 0x7bfffbff47004800};
15079 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
15080 }
15081}
15082
15083TEST_SVE(sve_fp_arith_pred_s) {
15084 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
15085 START();
15086
15087 uint64_t inputs[] = {0x4080000040400000, 0x400000003f800000};
15088 uint64_t inputs_fmulx[] = {0x7f800000ff800000, 0x0000000080000000};
15089 uint64_t inputs_nans[] = {0x7fffffffffffffff, 0x41000000c1000000};
15090
15091 BasicFPArithHelper(&masm, kSRegSize, inputs, inputs_fmulx, inputs_nans);
15092
15093 END();
15094
15095 if (CAN_RUN()) {
15096 RUN();
15097 uint64_t expected_z2[] = {0x40a0000040a00000, 0x4000000040a00000};
15098 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
15099 uint64_t expected_z3[] = {0x404000003f800000, 0x40000000c0400000};
15100 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
15101 uint64_t expected_z4[] = {0xc0400000bf800000, 0x4000000040400000};
15102 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
15103 uint64_t expected_z5[] = {0x404000003f800000, 0x4000000040400000};
15104 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
15105 uint64_t expected_z6[] = {0x4080000040c00000, 0x4000000040800000};
15106 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
15107 uint64_t expected_z7[] = {0xc0000000c0000000, 0x00000000c0000000};
15108 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
15109 uint64_t expected_z9[] = {0x3f80000040000000, 0x41000000c1000000};
15110 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
15111 uint64_t expected_z10[] = {0x3f80000040000000, 0x4100000040800000};
15112 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
15113 }
15114}
15115
15116TEST_SVE(sve_fp_arith_pred_d) {
15117 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
15118 START();
15119
15120 uint64_t inputs[] = {0x4000000000000000, 0x3ff0000000000000};
15121 uint64_t inputs_fmulx[] = {0x7ff0000000000000, 0x8000000000000000};
15122 uint64_t inputs_nans[] = {0x7fffffffffffffff, 0x4100000000000000};
15123
15124 BasicFPArithHelper(&masm, kDRegSize, inputs, inputs_fmulx, inputs_nans);
15125
15126 END();
15127
15128 if (CAN_RUN()) {
15129 RUN();
15130 uint64_t expected_z2[] = {0x4008000000000000, 0x4008000000000000};
15131 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
15132 uint64_t expected_z3[] = {0x3ff0000000000000, 0xbff0000000000000};
15133 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
15134 uint64_t expected_z4[] = {0xbff0000000000000, 0x3ff0000000000000};
15135 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
15136 uint64_t expected_z5[] = {0x3ff0000000000000, 0x3ff0000000000000};
15137 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
15138 uint64_t expected_z6[] = {0x4000000000000000, 0x4000000000000000};
15139 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
15140 uint64_t expected_z7[] = {0xc000000000000000, 0xc000000000000000};
15141 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
15142 uint64_t expected_z9[] = {0x3ff0000000000000, 0x4000000000000000};
15143 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
15144 uint64_t expected_z10[] = {0x3ff0000000000000, 0x4100000000000000};
15145 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
15146 }
15147}
15148
Martyn Capewella2fadc22020-01-16 16:09:55 +000015149TEST_SVE(sve_fp_arith_pred_imm) {
15150 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
15151 START();
15152
15153 int pred[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1};
15154 Initialise(&masm, p0.VnB(), pred);
15155 PRegisterM p0m = p0.Merging();
15156 __ Ptrue(p1.VnB());
15157
15158 __ Fdup(z0.VnD(), 0.0);
15159
15160 __ Mov(z1, z0);
15161 __ Fdiv(z1.VnH(), p1.Merging(), z1.VnH(), z1.VnH());
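  // 0.0 / 0.0 produces the default NaN; z1 provides NaN inputs for the
  // Fminnm/Fmaxnm cases below.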
15162 __ Mov(z2, z0);
15163 __ Fadd(z2.VnH(), p0m, z2.VnH(), 0.5);
15164 __ Mov(z3, z2);
15165 __ Fsub(z3.VnH(), p0m, z3.VnH(), 1.0);
15166 __ Mov(z4, z3);
15167 __ Fsub(z4.VnH(), p0m, 1.0, z4.VnH());
15168 __ Mov(z5, z4);
15169 __ Fmul(z5.VnH(), p0m, z5.VnH(), 2.0);
15170 __ Mov(z6, z1);
15171 __ Fminnm(z6.VnH(), p0m, z6.VnH(), 0.0);
15172 __ Mov(z7, z1);
15173 __ Fmaxnm(z7.VnH(), p0m, z7.VnH(), 1.0);
15174 __ Mov(z8, z5);
15175 __ Fmin(z8.VnH(), p0m, z8.VnH(), 1.0);
15176 __ Mov(z9, z5);
15177 __ Fmax(z9.VnH(), p0m, z9.VnH(), 0.0);
15178
15179 __ Mov(z11, z0);
15180 __ Fdiv(z11.VnS(), p1.Merging(), z11.VnS(), z11.VnS());
15181 __ Mov(z12, z0);
15182 __ Fadd(z12.VnS(), p0m, z12.VnS(), 0.5);
15183 __ Mov(z13, z12);
15184 __ Fsub(z13.VnS(), p0m, z13.VnS(), 1.0);
15185 __ Mov(z14, z13);
15186 __ Fsub(z14.VnS(), p0m, 1.0, z14.VnS());
15187 __ Mov(z15, z14);
15188 __ Fmul(z15.VnS(), p0m, z15.VnS(), 2.0);
15189 __ Mov(z16, z11);
15190 __ Fminnm(z16.VnS(), p0m, z16.VnS(), 0.0);
15191 __ Mov(z17, z11);
15192 __ Fmaxnm(z17.VnS(), p0m, z17.VnS(), 1.0);
15193 __ Mov(z18, z15);
15194 __ Fmin(z18.VnS(), p0m, z18.VnS(), 1.0);
15195 __ Mov(z19, z15);
15196 __ Fmax(z19.VnS(), p0m, z19.VnS(), 0.0);
15197
15198 __ Mov(z21, z0);
15199 __ Fdiv(z21.VnD(), p1.Merging(), z21.VnD(), z21.VnD());
15200 __ Mov(z22, z0);
15201 __ Fadd(z22.VnD(), p0m, z22.VnD(), 0.5);
15202 __ Mov(z23, z22);
15203 __ Fsub(z23.VnD(), p0m, z23.VnD(), 1.0);
15204 __ Mov(z24, z23);
15205 __ Fsub(z24.VnD(), p0m, 1.0, z24.VnD());
15206 __ Mov(z25, z24);
15207 __ Fmul(z25.VnD(), p0m, z25.VnD(), 2.0);
15208 __ Mov(z26, z21);
15209 __ Fminnm(z26.VnD(), p0m, z26.VnD(), 0.0);
15210 __ Mov(z27, z21);
15211 __ Fmaxnm(z27.VnD(), p0m, z27.VnD(), 1.0);
15212 __ Mov(z28, z25);
15213 __ Fmin(z28.VnD(), p0m, z28.VnD(), 1.0);
15214 __ Mov(z29, z25);
15215 __ Fmax(z29.VnD(), p0m, z29.VnD(), 0.0);
15216
15217 __ Index(z0.VnH(), -3, 1);
15218 __ Scvtf(z0.VnH(), p1.Merging(), z0.VnH());
15219 __ Fmax(z0.VnH(), p1.Merging(), z0.VnH(), 0.0);
15220 __ Index(z1.VnS(), -4, 2);
15221 __ Scvtf(z1.VnS(), p1.Merging(), z1.VnS());
15222 __ Fadd(z1.VnS(), p1.Merging(), z1.VnS(), 1.0);
15223
15224 END();
15225
15226 if (CAN_RUN()) {
15227 RUN();
15228 uint64_t expected_z2[] = {0x3800380038003800, 0x3800000038003800};
15229 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
15230 uint64_t expected_z3[] = {0xb800b800b800b800, 0xb8000000b800b800};
15231 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
15232 uint64_t expected_z4[] = {0x3e003e003e003e00, 0x3e0000003e003e00};
15233 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
15234 uint64_t expected_z5[] = {0x4200420042004200, 0x4200000042004200};
15235 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
15236 uint64_t expected_z6[] = {0x0000000000000000, 0x00007e0000000000};
15237 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
15238 uint64_t expected_z7[] = {0x3c003c003c003c00, 0x3c007e003c003c00};
15239 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
15240 uint64_t expected_z8[] = {0x3c003c003c003c00, 0x3c0000003c003c00};
15241 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
15242 uint64_t expected_z9[] = {0x4200420042004200, 0x4200000042004200};
15243 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
15244
15245 uint64_t expected_z12[] = {0x3f0000003f000000, 0x000000003f000000};
15246 ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
15247 uint64_t expected_z13[] = {0xbf000000bf000000, 0x00000000bf000000};
15248 ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
15249 uint64_t expected_z14[] = {0x3fc000003fc00000, 0x000000003fc00000};
15250 ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
15251 uint64_t expected_z15[] = {0x4040000040400000, 0x0000000040400000};
15252 ASSERT_EQUAL_SVE(expected_z15, z15.VnD());
15253 uint64_t expected_z16[] = {0x0000000000000000, 0x7fc0000000000000};
15254 ASSERT_EQUAL_SVE(expected_z16, z16.VnD());
15255 uint64_t expected_z17[] = {0x3f8000003f800000, 0x7fc000003f800000};
15256 ASSERT_EQUAL_SVE(expected_z17, z17.VnD());
15257 uint64_t expected_z18[] = {0x3f8000003f800000, 0x000000003f800000};
15258 ASSERT_EQUAL_SVE(expected_z18, z18.VnD());
15259 uint64_t expected_z19[] = {0x4040000040400000, 0x0000000040400000};
15260 ASSERT_EQUAL_SVE(expected_z19, z19.VnD());
15261
15262 uint64_t expected_z22[] = {0x3fe0000000000000, 0x3fe0000000000000};
15263 ASSERT_EQUAL_SVE(expected_z22, z22.VnD());
15264 uint64_t expected_z23[] = {0xbfe0000000000000, 0xbfe0000000000000};
15265 ASSERT_EQUAL_SVE(expected_z23, z23.VnD());
15266 uint64_t expected_z24[] = {0x3ff8000000000000, 0x3ff8000000000000};
15267 ASSERT_EQUAL_SVE(expected_z24, z24.VnD());
15268 uint64_t expected_z25[] = {0x4008000000000000, 0x4008000000000000};
15269 ASSERT_EQUAL_SVE(expected_z25, z25.VnD());
15270 uint64_t expected_z26[] = {0x0000000000000000, 0x0000000000000000};
15271 ASSERT_EQUAL_SVE(expected_z26, z26.VnD());
15272 uint64_t expected_z27[] = {0x3ff0000000000000, 0x3ff0000000000000};
15273 ASSERT_EQUAL_SVE(expected_z27, z27.VnD());
15274 uint64_t expected_z28[] = {0x3ff0000000000000, 0x3ff0000000000000};
15275 ASSERT_EQUAL_SVE(expected_z28, z28.VnD());
15276 uint64_t expected_z29[] = {0x4008000000000000, 0x4008000000000000};
15277 ASSERT_EQUAL_SVE(expected_z29, z29.VnD());
15278 uint64_t expected_z0[] = {0x4400420040003c00, 0x0000000000000000};
15279 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
15280 uint64_t expected_z1[] = {0x404000003f800000, 0xbf800000c0400000};
15281 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
15282 }
15283}
15284
Martyn Capewell37f28182020-01-14 10:15:10 +000015285TEST_SVE(sve_fscale) {
15286 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
15287 START();
15288
15289 uint64_t inputs_h[] = {0x4800470046004500, 0x4400420040003c00};
15290 InsrHelper(&masm, z0.VnD(), inputs_h);
15291 uint64_t inputs_s[] = {0x4080000040400000, 0x400000003f800000};
15292 InsrHelper(&masm, z1.VnD(), inputs_s);
15293 uint64_t inputs_d[] = {0x40f0000000000000, 0x4000000000000000};
15294 InsrHelper(&masm, z2.VnD(), inputs_d);
15295
15296 uint64_t scales[] = {0x00080002fff8fffe, 0x00100001fff0ffff};
15297 InsrHelper(&masm, z3.VnD(), scales);
15298
15299 __ Ptrue(p0.VnB());
15300 int pred[] = {0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1};
15301 Initialise(&masm, p1.VnB(), pred);
15302
15303 __ Mov(z4, z0);
15304 __ Fscale(z4.VnH(), p0.Merging(), z4.VnH(), z3.VnH());
15305 __ Mov(z5, z0);
15306 __ Fscale(z5.VnH(), p1.Merging(), z5.VnH(), z3.VnH());
15307
15308 __ Sunpklo(z3.VnS(), z3.VnH());
15309 __ Mov(z6, z1);
15310 __ Fscale(z6.VnS(), p0.Merging(), z6.VnS(), z3.VnS());
15311 __ Mov(z7, z1);
15312 __ Fscale(z7.VnS(), p1.Merging(), z7.VnS(), z3.VnS());
15313
15314 __ Sunpklo(z3.VnD(), z3.VnS());
15315 __ Mov(z8, z2);
15316 __ Fscale(z8.VnD(), p0.Merging(), z8.VnD(), z3.VnD());
15317 __ Mov(z9, z2);
15318 __ Fscale(z9.VnD(), p1.Merging(), z9.VnD(), z3.VnD());
15319
15320 // Test full double precision range scaling.
15321 __ Dup(z10.VnD(), 2045);
15322 __ Dup(z11.VnD(), 0x0010000000000000); // 2^-1022
15323 __ Fscale(z11.VnD(), p0.Merging(), z11.VnD(), z10.VnD());
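  // Fscale computes zn * 2^zm, so this scales 2^-1022 (the smallest normal
  // double) by 2^2045, giving 2^1023, i.e. 0x7fe0000000000000, as checked
  // below.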
15324
15325 END();
15326
15327 if (CAN_RUN()) {
15328 RUN();
15329
15330 uint64_t expected_z4[] = {0x68004f0026003d00, 0x7c00460002003800};
15331 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
15332 uint64_t expected_z5[] = {0x68004f0026004500, 0x7c00420002003800};
15333 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
15334
15335 uint64_t expected_z6[] = {0x4880000040c00000, 0x380000003f000000};
15336 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
15337 uint64_t expected_z7[] = {0x4880000040400000, 0x400000003f000000};
15338 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
15339
15340 uint64_t expected_z8[] = {0x3ff0000000000000, 0x3ff0000000000000};
15341 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
15342 uint64_t expected_z9[] = {0x40f0000000000000, 0x3ff0000000000000};
15343 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
15344
15345 uint64_t expected_z11[] = {0x7fe0000000000000, 0x7fe0000000000000};
15346 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
15347 }
15348}
15349
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015350typedef void (MacroAssembler::*FcvtFrintMFn)(const ZRegister& zd,
15351 const PRegisterM& pg,
15352 const ZRegister& zn);
15353
15354typedef void (MacroAssembler::*FcvtFrintZFn)(const ZRegister& zd,
15355 const PRegisterZ& pg,
15356 const ZRegister& zn);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015357
15358template <typename F, size_t N>
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015359static void TestFcvtFrintHelper(Test* config,
15360 FcvtFrintMFn macro_m,
15361 FcvtFrintZFn macro_z,
15362 int dst_type_size_in_bits,
15363 int src_type_size_in_bits,
15364 const F (&zn_inputs)[N],
15365 const int (&pg_inputs)[N],
15366 const uint64_t (&zd_expected_all_active)[N]) {
15367 VIXL_ASSERT(macro_m != NULL);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015368 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
15369 START();
15370
 15371 // If the input and result types have a different size, the instruction
 15372 // operates on elements of the largest specified type, so the predicate and
 15373 // the expected results use the larger lane size.
15374 int lane_size_in_bits =
15375 std::max(dst_type_size_in_bits, src_type_size_in_bits);
15376
15377 ZRegister zd_all_active = z25;
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015378 ZRegister zd_merging = z26;
TatWai Chongdb7437c2020-01-09 17:44:10 -080015379 ZRegister zn = z27;
15380
15381 uint64_t zn_rawbits[N];
15382 FPToRawbitsWithSize(zn_inputs, zn_rawbits, src_type_size_in_bits);
15383 InsrHelper(&masm, zn.WithLaneSize(lane_size_in_bits), zn_rawbits);
15384
15385 PRegisterWithLaneSize pg_all_active = p0.WithLaneSize(lane_size_in_bits);
15386 __ Ptrue(pg_all_active);
15387
 15388 // Test floating-point conversions with all lanes active.
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015389 (masm.*macro_m)(zd_all_active.WithLaneSize(dst_type_size_in_bits),
15390 pg_all_active.Merging(),
15391 zn.WithLaneSize(src_type_size_in_bits));
TatWai Chongdb7437c2020-01-09 17:44:10 -080015392
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015393 PRegisterWithLaneSize pg_merging = p1.WithLaneSize(lane_size_in_bits);
15394 Initialise(&masm, pg_merging, pg_inputs);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015395
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015396 __ Dup(zd_merging.VnD(), 0x0bad0bad0bad0bad);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015397
 15398 // Use the same `zn` inputs to test floating-point conversions, but with some
 15399 // lanes set inactive.
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015400 (masm.*macro_m)(zd_merging.WithLaneSize(dst_type_size_in_bits),
15401 pg_merging.Merging(),
15402 zn.WithLaneSize(src_type_size_in_bits));
15403
15404 ZRegister zd_zeroing = z24;
15405 PRegisterWithLaneSize pg_zeroing = p1.WithLaneSize(lane_size_in_bits);
15406 Initialise(&masm, pg_zeroing, pg_inputs);
15407
15408 if (macro_z != NULL) {
15409 __ Dup(zd_zeroing.VnD(), 0x0bad0bad0bad0bad);
15410 (masm.*macro_z)(zd_zeroing.WithLaneSize(dst_type_size_in_bits),
15411 pg_zeroing.Zeroing(),
15412 zn.WithLaneSize(src_type_size_in_bits));
15413 }
TatWai Chongdb7437c2020-01-09 17:44:10 -080015414
15415 END();
15416
15417 if (CAN_RUN()) {
15418 RUN();
15419
15420 ASSERT_EQUAL_SVE(zd_expected_all_active,
15421 zd_all_active.WithLaneSize(lane_size_in_bits));
15422
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015423 uint64_t zd_expected_merging[N];
TatWai Chongdb7437c2020-01-09 17:44:10 -080015424 for (unsigned i = 0; i < N; i++) {
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015425 zd_expected_merging[i] =
TatWai Chongdb7437c2020-01-09 17:44:10 -080015426 pg_inputs[i] ? zd_expected_all_active[i]
15427 : 0x0bad0bad0bad0bad & GetUintMask(lane_size_in_bits);
15428 }
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015429 ASSERT_EQUAL_SVE(zd_expected_merging,
15430 zd_merging.WithLaneSize(lane_size_in_bits));
15431
15432 if (macro_z != NULL) {
15433 uint64_t zd_expected_zeroing[N] = {0};
15434 for (unsigned i = 0; i < N; i++) {
15435 if (pg_inputs[i]) {
15436 zd_expected_zeroing[i] = zd_expected_all_active[i];
15437 }
15438 }
15439 ASSERT_EQUAL_SVE(zd_expected_zeroing,
15440 zd_zeroing.WithLaneSize(lane_size_in_bits));
15441 }
TatWai Chongdb7437c2020-01-09 17:44:10 -080015442 }
15443}
15444
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015445template <typename F, size_t N>
15446static void TestFcvtzHelper(Test* config,
15447 FcvtFrintMFn macro_m,
15448 int dst_type_size_in_bits,
15449 int src_type_size_in_bits,
15450 const F (&zn_inputs)[N],
15451 const int (&pg_inputs)[N],
15452 const uint64_t (&zd_expected_all_active)[N]) {
15453 TestFcvtFrintHelper(config,
15454 macro_m,
15455 // Fcvt variants have no zeroing predication form.
15456 NULL,
15457 dst_type_size_in_bits,
15458 src_type_size_in_bits,
15459 zn_inputs,
15460 pg_inputs,
15461 zd_expected_all_active);
15462}
15463
TatWai Chongdb7437c2020-01-09 17:44:10 -080015464TEST_SVE(fcvtzs_fcvtzu_float16) {
TatWai Chongdb7437c2020-01-09 17:44:10 -080015465 const double h_max_float16 = kHMaxInt; // Largest float16 == INT16_MAX.
15466 const double h_min_float16 = -h_max_float16; // Smallest float16 > INT16_MIN.
15467 const double largest_float16 = 0xffe0; // 65504
15468 const double smallest_float16 = -largest_float16;
15469 const double h_max_int_sub_one = kHMaxInt - 1;
15470 const double h_min_int_add_one = kHMinInt + 1;
15471
15472 double zn_inputs[] = {1.0,
15473 1.1,
15474 1.5,
15475 -1.5,
15476 h_max_float16,
15477 h_min_float16,
15478 largest_float16,
15479 smallest_float16,
15480 kFP64PositiveInfinity,
15481 kFP64NegativeInfinity,
15482 h_max_int_sub_one,
15483 h_min_int_add_one};
15484
15485 int pg_inputs[] = {0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0};
15486
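  // Note on the expected values: inputs outside the destination range
  // (including infinities) saturate to the destination type's minimum or
  // maximum value, and negative inputs saturate to zero for the unsigned
  // conversions.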
15487 uint64_t expected_fcvtzs_fp162h[] = {1,
15488 1,
15489 1,
15490 0xffff,
15491 0x7fff,
15492 0x8000,
15493 0x7fff,
15494 0x8000,
15495 0x7fff,
15496 0x8000,
15497 0x7fff,
15498 0x8000};
15499
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015500 uint64_t expected_fcvtzu_fp162h[] =
15501 {1, 1, 1, 0, 0x8000, 0, 0xffe0, 0, 0xffff, 0, 0x8000, 0};
TatWai Chongdb7437c2020-01-09 17:44:10 -080015502
15503 // Float16 to 16-bit integers.
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015504 TestFcvtzHelper(config,
15505 &MacroAssembler::Fcvtzs,
15506 kHRegSize,
15507 kHRegSize,
15508 zn_inputs,
15509 pg_inputs,
15510 expected_fcvtzs_fp162h);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015511
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015512 TestFcvtzHelper(config,
15513 &MacroAssembler::Fcvtzu,
15514 kHRegSize,
15515 kHRegSize,
15516 zn_inputs,
15517 pg_inputs,
15518 expected_fcvtzu_fp162h);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015519
15520 uint64_t expected_fcvtzs_fp162w[] = {1,
15521 1,
15522 1,
15523 0xffffffff,
15524 0x8000,
15525 0xffff8000,
15526 0xffe0,
15527 0xffff0020,
15528 0x7fffffff,
15529 0x80000000,
15530 0x8000,
15531 0xffff8000};
15532
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015533 uint64_t expected_fcvtzu_fp162w[] =
15534 {1, 1, 1, 0, 0x8000, 0, 0xffe0, 0, 0xffffffff, 0, 0x8000, 0};
TatWai Chongdb7437c2020-01-09 17:44:10 -080015535
15536 // Float16 to 32-bit integers.
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015537 TestFcvtzHelper(config,
15538 &MacroAssembler::Fcvtzs,
15539 kSRegSize,
15540 kHRegSize,
15541 zn_inputs,
15542 pg_inputs,
15543 expected_fcvtzs_fp162w);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015544
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015545 TestFcvtzHelper(config,
15546 &MacroAssembler::Fcvtzu,
15547 kSRegSize,
15548 kHRegSize,
15549 zn_inputs,
15550 pg_inputs,
15551 expected_fcvtzu_fp162w);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015552
15553 uint64_t expected_fcvtzs_fp162x[] = {1,
15554 1,
15555 1,
15556 0xffffffffffffffff,
15557 0x8000,
15558 0xffffffffffff8000,
15559 0xffe0,
15560 0xffffffffffff0020,
15561 0x7fffffffffffffff,
15562 0x8000000000000000,
15563 0x8000,
15564 0xffffffffffff8000};
15565
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015566 uint64_t expected_fcvtzu_fp162x[] =
15567 {1, 1, 1, 0, 0x8000, 0, 0xffe0, 0, 0xffffffffffffffff, 0, 0x8000, 0};
TatWai Chongdb7437c2020-01-09 17:44:10 -080015568
15569 // Float16 to 64-bit integers.
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015570 TestFcvtzHelper(config,
15571 &MacroAssembler::Fcvtzs,
15572 kDRegSize,
15573 kHRegSize,
15574 zn_inputs,
15575 pg_inputs,
15576 expected_fcvtzs_fp162x);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015577
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015578 TestFcvtzHelper(config,
15579 &MacroAssembler::Fcvtzu,
15580 kDRegSize,
15581 kHRegSize,
15582 zn_inputs,
15583 pg_inputs,
15584 expected_fcvtzu_fp162x);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015585}
15586
15587TEST_SVE(fcvtzs_fcvtzu_float) {
15588 const double w_max_float = 0x7fffff80; // Largest float < INT32_MAX.
15589 const double w_min_float = -w_max_float; // Smallest float > INT32_MIN.
15590 const double x_max_float = 0x7fffff8000000000; // Largest float < INT64_MAX.
15591 const double x_min_float = -x_max_float; // Smallest float > INT64_MIN.
15592 const double w_max_int_sub_one = kWMaxInt - 1;
15593 const double w_min_int_add_one = kWMinInt + 1;
15594 const double x_max_int_sub_one = kXMaxInt - 1;
15595 const double x_min_int_add_one = kXMinInt + 1;
15596
TatWai Chongdb7437c2020-01-09 17:44:10 -080015597 double zn_inputs[] = {1.0,
15598 1.1,
15599 1.5,
15600 -1.5,
15601 w_max_float,
15602 w_min_float,
15603 x_max_float,
15604 x_min_float,
15605 kFP64PositiveInfinity,
15606 kFP64NegativeInfinity,
15607 w_max_int_sub_one,
15608 w_min_int_add_one,
15609 x_max_int_sub_one,
15610 x_min_int_add_one};
15611
15612 int pg_inputs[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0};
15613
15614 uint64_t expected_fcvtzs_s2w[] = {1,
15615 1,
15616 1,
15617 0xffffffff,
15618 0x7fffff80,
15619 0x80000080,
15620 0x7fffffff,
15621 0x80000000,
15622 0x7fffffff,
15623 0x80000000,
15624 0x7fffffff,
15625 0x80000000,
15626 0x7fffffff,
15627 0x80000000};
15628
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015629 uint64_t expected_fcvtzu_s2w[] = {1,
15630 1,
15631 1,
15632 0,
15633 0x7fffff80,
15634 0,
15635 0xffffffff,
15636 0,
15637 0xffffffff,
15638 0,
15639 0x80000000,
15640 0,
15641 0xffffffff,
15642 0};
TatWai Chongdb7437c2020-01-09 17:44:10 -080015643
15644 // Float to 32-bit integers.
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015645 TestFcvtzHelper(config,
15646 &MacroAssembler::Fcvtzs,
15647 kSRegSize,
15648 kSRegSize,
15649 zn_inputs,
15650 pg_inputs,
15651 expected_fcvtzs_s2w);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015652
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015653 TestFcvtzHelper(config,
15654 &MacroAssembler::Fcvtzu,
15655 kSRegSize,
15656 kSRegSize,
15657 zn_inputs,
15658 pg_inputs,
15659 expected_fcvtzu_s2w);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015660
15661 uint64_t expected_fcvtzs_s2x[] = {1,
15662 1,
15663 1,
15664 0xffffffffffffffff,
15665 0x7fffff80,
15666 0xffffffff80000080,
15667 0x7fffff8000000000,
15668 0x8000008000000000,
15669 0x7fffffffffffffff,
15670 0x8000000000000000,
15671 0x80000000,
15672 0xffffffff80000000,
15673 0x7fffffffffffffff,
15674 0x8000000000000000};
15675
15676 uint64_t expected_fcvtzu_s2x[] = {1,
15677 1,
15678 1,
15679 0,
15680 0x7fffff80,
15681 0,
15682 0x7fffff8000000000,
15683 0,
15684 0xffffffffffffffff,
15685 0,
15686 0x0000000080000000,
15687 0,
15688 0x8000000000000000,
15689 0};
15690
15691 // Float to 64-bit integers.
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015692 TestFcvtzHelper(config,
15693 &MacroAssembler::Fcvtzs,
15694 kDRegSize,
15695 kSRegSize,
15696 zn_inputs,
15697 pg_inputs,
15698 expected_fcvtzs_s2x);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015699
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015700 TestFcvtzHelper(config,
15701 &MacroAssembler::Fcvtzu,
15702 kDRegSize,
15703 kSRegSize,
15704 zn_inputs,
15705 pg_inputs,
15706 expected_fcvtzu_s2x);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015707}
15708
15709TEST_SVE(fcvtzs_fcvtzu_double) {
TatWai Chongdb7437c2020-01-09 17:44:10 -080015710 const double w_max_float = 0x7fffff80; // Largest float < INT32_MAX.
15711 const double w_min_float = -w_max_float; // Smallest float > INT32_MIN.
15712 const double x_max_float = 0x7fffff8000000000; // Largest float < INT64_MAX.
15713 const double x_min_float = -x_max_float; // Smallest float > INT64_MIN.
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015714 const double w_max_double = kWMaxInt; // Largest double == INT32_MAX.
15715 const double w_min_double = -w_max_double; // Smallest double > INT32_MIN.
15716 const double x_max_double =
15717 0x7ffffffffffffc00; // Largest double < INT64_MAX.
15718 const double x_min_double = -x_max_double; // Smallest double > INT64_MIN.
TatWai Chongdb7437c2020-01-09 17:44:10 -080015719 const double w_max_int_sub_one = kWMaxInt - 1;
15720 const double w_min_int_add_one = kWMinInt + 1;
15721 const double x_max_int_sub_one = kXMaxInt - 1;
15722 const double x_min_int_add_one = kXMinInt + 1;
15723
15724 double zn_inputs[] = {1.0,
15725 1.1,
15726 1.5,
15727 -1.5,
15728 w_max_float,
15729 w_min_float,
15730 x_max_float,
15731 x_min_float,
15732 w_max_double,
15733 w_min_double,
15734 x_max_double,
15735 x_min_double,
15736 kFP64PositiveInfinity,
15737 kFP64NegativeInfinity,
15738 w_max_int_sub_one,
15739 w_min_int_add_one,
15740 x_max_int_sub_one,
15741 x_min_int_add_one};
15742
15743 int pg_inputs[] = {1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0};
15744
15745 uint64_t expected_fcvtzs_d2w[] = {1,
15746 1,
15747 1,
15748 0xffffffffffffffff,
15749 0x7fffff80,
15750 0xffffffff80000080,
15751 0x7fffffff,
15752 0xffffffff80000000,
15753 0x7fffffff,
15754 0xffffffff80000001,
15755 0x7fffffff,
15756 0xffffffff80000000,
15757 0x7fffffff,
15758 0xffffffff80000000,
15759 0x7ffffffe,
15760 0xffffffff80000001,
15761 0x7fffffff,
15762 0xffffffff80000000};
15763
15764 uint64_t expected_fcvtzu_d2w[] = {1,
15765 1,
15766 1,
15767 0,
15768 0x7fffff80,
15769 0,
15770 0xffffffff,
15771 0,
15772 0x7fffffff,
15773 0,
15774 0xffffffff,
15775 0,
15776 0xffffffff,
15777 0,
15778 0x7ffffffe,
15779 0,
15780 0xffffffff,
15781 0};
15782
15783 // Double to 32-bit integers.
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015784 TestFcvtzHelper(config,
15785 &MacroAssembler::Fcvtzs,
15786 kSRegSize,
15787 kDRegSize,
15788 zn_inputs,
15789 pg_inputs,
15790 expected_fcvtzs_d2w);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015791
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015792 TestFcvtzHelper(config,
15793 &MacroAssembler::Fcvtzu,
15794 kSRegSize,
15795 kDRegSize,
15796 zn_inputs,
15797 pg_inputs,
15798 expected_fcvtzu_d2w);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015799
15800 uint64_t expected_fcvtzs_d2x[] = {1,
15801 1,
15802 1,
15803 0xffffffffffffffff,
15804 0x7fffff80,
15805 0xffffffff80000080,
15806 0x7fffff8000000000,
15807 0x8000008000000000,
15808 0x7fffffff,
15809 0xffffffff80000001,
15810 0x7ffffffffffffc00,
15811 0x8000000000000400,
15812 0x7fffffffffffffff,
15813 0x8000000000000000,
15814 0x7ffffffe,
15815 0xffffffff80000001,
15816 0x7fffffffffffffff,
15817 0x8000000000000000};
15818
15819 uint64_t expected_fcvtzu_d2x[] = {1,
15820 1,
15821 1,
15822 0,
15823 0x7fffff80,
15824 0,
15825 0x7fffff8000000000,
15826 0,
15827 0x7fffffff,
15828 0,
15829 0x7ffffffffffffc00,
15830 0,
15831 0xffffffffffffffff,
15832 0,
15833 0x000000007ffffffe,
15834 0,
15835 0x8000000000000000,
15836 0};
15837
15838 // Double to 64-bit integers.
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015839 TestFcvtzHelper(config,
15840 &MacroAssembler::Fcvtzs,
15841 kDRegSize,
15842 kDRegSize,
15843 zn_inputs,
15844 pg_inputs,
15845 expected_fcvtzs_d2x);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015846
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015847 TestFcvtzHelper(config,
15848 &MacroAssembler::Fcvtzu,
15849 kDRegSize,
15850 kDRegSize,
15851 zn_inputs,
15852 pg_inputs,
15853 expected_fcvtzu_d2x);
15854}
15855
15856template <typename F, size_t N>
15857static void TestFrintHelper(Test* config,
15858 FcvtFrintMFn macro_m,
15859 FcvtFrintZFn macro_z,
15860 int lane_size_in_bits,
15861 const F (&zn_inputs)[N],
15862 const int (&pg_inputs)[N],
15863 const F (&zd_expected)[N]) {
15864 uint64_t zd_expected_rawbits[N];
15865 FPToRawbitsWithSize(zd_expected, zd_expected_rawbits, lane_size_in_bits);
15866 TestFcvtFrintHelper(config,
15867 macro_m,
15868 macro_z,
15869 lane_size_in_bits,
15870 lane_size_in_bits,
15871 zn_inputs,
15872 pg_inputs,
15873 zd_expected_rawbits);
15874}
15875
15876TEST_SVE(frint) {
15877 const double inf_pos = kFP64PositiveInfinity;
15878 const double inf_neg = kFP64NegativeInfinity;
15879
15880 double zn_inputs[] =
15881 {1.1, 1.5, 1.9, 2.5, -1.5, -2.5, 0.0, -0.0, -0.2, inf_pos, inf_neg};
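  // Expected results for each rounding variant:
  //   frinta - to nearest, ties away from zero
  //   frinti - current FPCR rounding mode (round-to-nearest for these tests)
  //   frintm - toward minus infinity
  //   frintn - to nearest, ties to even
  //   frintp - toward plus infinity
  //   frintx - current FPCR rounding mode, raising Inexact if the result
  //            differs from the input
  //   frintz - toward zero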
15882 double zd_expected_a[] =
15883 {1.0, 2.0, 2.0, 3.0, -2.0, -3.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
15884 double zd_expected_i[] =
15885 {1.0, 2.0, 2.0, 2.0, -2.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
15886 double zd_expected_m[] =
15887 {1.0, 1.0, 1.0, 2.0, -2.0, -3.0, 0.0, -0.0, -1.0, inf_pos, inf_neg};
15888 double zd_expected_n[] =
15889 {1.0, 2.0, 2.0, 2.0, -2.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
15890 double zd_expected_p[] =
15891 {2.0, 2.0, 2.0, 3.0, -1.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
15892 double zd_expected_x[] =
15893 {1.0, 2.0, 2.0, 2.0, -2.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
15894 double zd_expected_z[] =
15895 {1.0, 1.0, 1.0, 2.0, -1.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
15896
15897 int pg_inputs[] = {0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0};
15898
15899 struct TestDataSet {
15900 FcvtFrintMFn macro_m; // merging form.
15901 FcvtFrintZFn macro_z; // zeroing form.
15902 double (&expected)[11];
15903 };
15904
15905 TestDataSet test_data[] =
15906 {{&MacroAssembler::Frinta, &MacroAssembler::Frinta, zd_expected_a},
15907 {&MacroAssembler::Frinti, &MacroAssembler::Frinti, zd_expected_i},
15908 {&MacroAssembler::Frintm, &MacroAssembler::Frintm, zd_expected_m},
15909 {&MacroAssembler::Frintn, &MacroAssembler::Frintn, zd_expected_n},
15910 {&MacroAssembler::Frintp, &MacroAssembler::Frintp, zd_expected_p},
15911 {&MacroAssembler::Frintx, &MacroAssembler::Frintx, zd_expected_x},
15912 {&MacroAssembler::Frintz, &MacroAssembler::Frintz, zd_expected_z}};
15913
15914 unsigned lane_sizes[] = {kHRegSize, kSRegSize, kDRegSize};
15915
15916 for (size_t i = 0; i < sizeof(test_data) / sizeof(TestDataSet); i++) {
15917 for (size_t j = 0; j < ArrayLength(lane_sizes); j++) {
15918 TestFrintHelper(config,
15919 test_data[i].macro_m,
15920 test_data[i].macro_z,
15921 lane_sizes[j],
15922 zn_inputs,
15923 pg_inputs,
15924 test_data[i].expected);
15925 }
15926 }
TatWai Chongdb7437c2020-01-09 17:44:10 -080015927}
15928
TatWai Chong31cd6a02020-01-10 13:03:26 -080015929struct CvtfTestDataSet {
15930 uint64_t int_value;
15931 uint64_t scvtf_result;
15932 uint64_t ucvtf_result;
15933};
15934
15935template <size_t N>
15936static void TestUScvtfHelper(Test* config,
15937 int dst_type_size_in_bits,
15938 int src_type_size_in_bits,
15939 const int (&pg_inputs)[N],
15940 const CvtfTestDataSet (&data_set)[N]) {
15941 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
15942 START();
15943
15944 // Unpack the data from the array of struct into individual arrays that can
15945 // simplify the testing.
15946 uint64_t zn_inputs[N];
15947 uint64_t expected_zd_scvtf_all_active[N];
15948 uint64_t expected_zd_ucvtf_all_active[N];
15949 for (size_t i = 0; i < N; i++) {
15950 zn_inputs[i] = data_set[i].int_value;
15951 expected_zd_scvtf_all_active[i] = data_set[i].scvtf_result;
15952 expected_zd_ucvtf_all_active[i] = data_set[i].ucvtf_result;
15953 }
15954
15955 // If the input and result types have a different size, the instruction
15956 // operates on elements of the largest specified type.
15957 int lane_size_in_bits =
15958 std::max(dst_type_size_in_bits, src_type_size_in_bits);
15959
15960 ZRegister zd_scvtf_all_active = z25;
15961 ZRegister zd_ucvtf_all_active = z26;
15962 ZRegister zn = z27;
15963 InsrHelper(&masm, zn.WithLaneSize(lane_size_in_bits), zn_inputs);
15964
15965 PRegisterWithLaneSize pg_all_active = p0.WithLaneSize(lane_size_in_bits);
15966 __ Ptrue(pg_all_active);
15967
 15968 // Test integer conversions with all lanes active.
15969 __ Scvtf(zd_scvtf_all_active.WithLaneSize(dst_type_size_in_bits),
15970 pg_all_active.Merging(),
15971 zn.WithLaneSize(src_type_size_in_bits));
15972 __ Ucvtf(zd_ucvtf_all_active.WithLaneSize(dst_type_size_in_bits),
15973 pg_all_active.Merging(),
15974 zn.WithLaneSize(src_type_size_in_bits));
15975
15976 ZRegister zd_scvtf_merged = z23;
15977 ZRegister zd_ucvtf_merged = z24;
15978
15979 PRegisterWithLaneSize pg_merged = p1.WithLaneSize(lane_size_in_bits);
15980 Initialise(&masm, pg_merged, pg_inputs);
15981
15982 uint64_t snan;
15983 switch (lane_size_in_bits) {
15984 case kHRegSize:
15985 snan = 0x7c11;
15986 break;
15987 case kSRegSize:
15988 snan = 0x7f951111;
15989 break;
15990 case kDRegSize:
15991 snan = 0x7ff5555511111111;
15992 break;
15993 }
15994 __ Dup(zd_scvtf_merged.WithLaneSize(lane_size_in_bits), snan);
15995 __ Dup(zd_ucvtf_merged.WithLaneSize(lane_size_in_bits), snan);
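  // Seeding the merging destinations with a signalling NaN payload makes lanes
  // that the predicated conversions leave untouched easy to identify.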
15996
 15997 // Use the same `zn` inputs to test integer conversions, but with some lanes
 15998 // set inactive.
15999 __ Scvtf(zd_scvtf_merged.WithLaneSize(dst_type_size_in_bits),
16000 pg_merged.Merging(),
16001 zn.WithLaneSize(src_type_size_in_bits));
16002 __ Ucvtf(zd_ucvtf_merged.WithLaneSize(dst_type_size_in_bits),
16003 pg_merged.Merging(),
16004 zn.WithLaneSize(src_type_size_in_bits));
16005
16006 END();
16007
16008 if (CAN_RUN()) {
16009 RUN();
16010
16011 ASSERT_EQUAL_SVE(expected_zd_scvtf_all_active,
16012 zd_scvtf_all_active.WithLaneSize(lane_size_in_bits));
16013 ASSERT_EQUAL_SVE(expected_zd_ucvtf_all_active,
16014 zd_ucvtf_all_active.WithLaneSize(lane_size_in_bits));
16015
16016 uint64_t expected_zd_scvtf_merged[N];
16017 for (size_t i = 0; i < N; i++) {
16018 expected_zd_scvtf_merged[i] =
16019 pg_inputs[i] ? expected_zd_scvtf_all_active[i] : snan;
16020 }
16021 ASSERT_EQUAL_SVE(expected_zd_scvtf_merged,
16022 zd_scvtf_merged.WithLaneSize(lane_size_in_bits));
16023
16024 uint64_t expected_zd_ucvtf_merged[N];
16025 for (size_t i = 0; i < N; i++) {
16026 expected_zd_ucvtf_merged[i] =
16027 pg_inputs[i] ? expected_zd_ucvtf_all_active[i] : snan;
16028 }
16029 ASSERT_EQUAL_SVE(expected_zd_ucvtf_merged,
16030 zd_ucvtf_merged.WithLaneSize(lane_size_in_bits));
16031 }
16032}
16033
16034TEST_SVE(scvtf_ucvtf_h_s_d_to_float16) {
16035 // clang-format off
16036 CvtfTestDataSet data_set_1[] = {
16037 // Simple conversions of positive numbers which require no rounding; the
 16038 // results should not depend on the rounding mode, and ucvtf and scvtf should
16039 // produce the same result.
16040 {0x0000, 0x0000, 0x0000},
16041 {0x0001, 0x3c00, 0x3c00},
16042 {0x0010, 0x4c00, 0x4c00},
16043 {0x0080, 0x5800, 0x5800},
16044 {0x0400, 0x6400, 0x6400},
16045 // Conversions which require rounding.
16046 {0x4000, 0x7400, 0x7400},
16047 {0x4001, 0x7400, 0x7400},
16048 // Round up to produce a result that's too big for the input to represent.
16049 {0x7ff0, 0x77ff, 0x77ff},
16050 {0x7ff1, 0x77ff, 0x77ff},
16051 {0x7ffe, 0x7800, 0x7800},
16052 {0x7fff, 0x7800, 0x7800}};
16053 int pg_1[] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
16054 TestUScvtfHelper(config, kHRegSize, kDRegSize, pg_1, data_set_1);
16055 TestUScvtfHelper(config, kHRegSize, kSRegSize, pg_1, data_set_1);
16056 TestUScvtfHelper(config, kHRegSize, kHRegSize, pg_1, data_set_1);
16057
16058 CvtfTestDataSet data_set_2[] = {
16059 // Test mantissa extremities.
16060 {0x0401, 0x6401, 0x6401},
16061 {0x4020, 0x7402, 0x7402},
16062 // The largest int16_t that fits in a float16.
16063 {0xffef, 0xcc40, 0x7bff},
16064 // Values that would be negative if treated as an int16_t.
16065 {0xff00, 0xdc00, 0x7bf8},
16066 {0x8000, 0xf800, 0x7800},
16067 {0x8100, 0xf7f0, 0x7808},
16068 // Check for bit pattern reproduction.
16069 {0x0123, 0x5c8c, 0x5c8c},
16070 {0x0cde, 0x6a6f, 0x6a6f},
16071 // Simple conversions of negative int64_t values. These require no rounding,
16072 // and the results should not depend on the rounding mode.
16073 {0xf800, 0xe800, 0x7bc0},
16074 {0xfc00, 0xe400, 0x7be0},
16075 {0xc000, 0xf400, 0x7a00},
16076 // Check rounding of negative int16_t values.
16077 {0x8ffe, 0xf700, 0x7880},
16078 {0x8fff, 0xf700, 0x7880},
16079 {0xffee, 0xcc80, 0x7bff},
16080 {0xffef, 0xcc40, 0x7bff}};
16081 int pg_2[] = {1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1};
16082 // The `32-bit to float16` and `64-bit to float16` versions of the tests above
16083 // have been covered by the `16-bit to float16` `ucvtf` cases.
16084 TestUScvtfHelper(config, kHRegSize, kHRegSize, pg_2, data_set_2);
16085 // clang-format on
16086}
16087
16088TEST_SVE(scvtf_ucvtf_s_to_float) {
16089 // clang-format off
16090 int dst_lane_size = kSRegSize;
16091 int src_lane_size = kSRegSize;
16092
16093 // Simple conversions of positive numbers which require no rounding; the
16094 // results should not depend on the rounding mode, and ucvtf and scvtf should
16095 // produce the same result.
16096 CvtfTestDataSet data_set_1[] = {
16097 {0x00000000, 0x00000000, 0x00000000},
16098 {0x00000001, 0x3f800000, 0x3f800000},
16099 {0x00004000, 0x46800000, 0x46800000},
16100 {0x00010000, 0x47800000, 0x47800000},
16101 {0x40000000, 0x4e800000, 0x4e800000}};
16102 int pg_1[] = {1, 0, 1, 0, 0};
16103 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_1, data_set_1);
16104
16105 CvtfTestDataSet data_set_2[] = {
16106 // Test mantissa extremities.
16107 {0x00800001, 0x4b000001, 0x4b000001},
16108 {0x40400000, 0x4e808000, 0x4e808000},
16109 // The largest int32_t that fits in a float.
16110 {0x7fffff80, 0x4effffff, 0x4effffff},
16111 // Values that would be negative if treated as an int32_t.
16112 {0xffffffff, 0xbf800000, 0x4f800000},
16113 {0xffffff00, 0xc3800000, 0x4f7fffff},
16114 {0x80000000, 0xcf000000, 0x4f000000},
16115 {0x80000001, 0xcf000000, 0x4f000000},
16116 // Check for bit pattern reproduction.
16117 {0x089abcde, 0x4d09abce, 0x4d09abce},
16118 {0x12345678, 0x4d91a2b4, 0x4d91a2b4}};
16119 int pg_2[] = {1, 0, 1, 0, 1, 1, 1, 0, 0};
16120 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_2, data_set_2);
16121
16122 // Simple conversions of negative int32_t values. These require no rounding,
16123 // and the results should not depend on the rounding mode.
16124 CvtfTestDataSet data_set_3[] = {
16125 {0xffffc000, 0xc6800000, 0x4f7fffc0},
16126 {0xffff0000, 0xc7800000, 0x4f7fff00},
16127 {0xc0000000, 0xce800000, 0x4f400000},
16128 // Conversions which require rounding.
16129 {0x72800000, 0x4ee50000, 0x4ee50000},
16130 {0x72800001, 0x4ee50000, 0x4ee50000},
16131 {0x73000000, 0x4ee60000, 0x4ee60000},
16132 // Check rounding of negative int32_t values.
16133 {0x80000140, 0xcefffffe, 0x4f000001},
16134 {0x80000141, 0xcefffffd, 0x4f000001},
16135 {0x80000180, 0xcefffffd, 0x4f000002},
16136 // Round up to produce a result that's too big for the input to represent.
16137 {0x7fffffc0, 0x4f000000, 0x4f000000},
16138 {0x7fffffff, 0x4f000000, 0x4f000000}};
16139 int pg_3[] = {1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0};
16140 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_3, data_set_3);
16141 // clang-format on
16142}
16143
16144TEST_SVE(scvtf_ucvtf_d_to_float) {
16145 // clang-format off
16146 int dst_lane_size = kSRegSize;
16147 int src_lane_size = kDRegSize;
16148
16149 // Simple conversions of positive numbers which require no rounding; the
16150 // results should not depend on the rounding mode, and ucvtf and scvtf should
16151 // produce the same result.
16152 CvtfTestDataSet data_set_1[] = {
16153 {0x0000000000000000, 0x00000000, 0x00000000},
16154 {0x0000000000000001, 0x3f800000, 0x3f800000},
16155 {0x0000000040000000, 0x4e800000, 0x4e800000},
16156 {0x0000000100000000, 0x4f800000, 0x4f800000},
16157 {0x4000000000000000, 0x5e800000, 0x5e800000}};
16158 int pg_1[] = {1, 1, 0, 1, 0};
16159 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_1, data_set_1);
16160
16161 CvtfTestDataSet data_set_2[] = {
16162 // Test mantissa extremities.
16163 {0x0010000000000001, 0x59800000, 0x59800000},
16164 {0x4008000000000000, 0x5e801000, 0x5e801000},
16165 // The largest int32_t that fits in a float.
16166 {0x000000007fffff80, 0x4effffff, 0x4effffff},
16167 // Values that would be negative if treated as an int32_t.
16168 {0x00000000ffffffff, 0x4f800000, 0x4f800000},
16169 {0x00000000ffffff00, 0x4f7fffff, 0x4f7fffff},
16170 {0x0000000080000000, 0x4f000000, 0x4f000000},
16171 {0x0000000080000100, 0x4f000001, 0x4f000001},
16172 // The largest int64_t that fits in a float.
16173 {0x7fffff8000000000, 0x5effffff, 0x5effffff},
16174 // Check for bit pattern reproduction.
16175 {0x0123456789abcde0, 0x5b91a2b4, 0x5b91a2b4},
16176 {0x0000000000876543, 0x4b076543, 0x4b076543}};
16177 int pg_2[] = {1, 0, 0, 0, 1, 0, 0, 0, 0, 1};
16178 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_2, data_set_2);
16179
16180 CvtfTestDataSet data_set_3[] = {
16181 // Simple conversions of negative int64_t values. These require no rounding,
16182 // and the results should not depend on the rounding mode.
16183 {0xffffffffc0000000, 0xce800000, 0x5f800000},
16184 {0xffffffff00000000, 0xcf800000, 0x5f800000},
16185 {0xc000000000000000, 0xde800000, 0x5f400000},
16186 // Conversions which require rounding.
16187 {0x0000800002800000, 0x57000002, 0x57000002},
16188 {0x0000800002800001, 0x57000003, 0x57000003},
16189 {0x0000800003000000, 0x57000003, 0x57000003},
16190 // Check rounding of negative int64_t values.
16191 {0x8000014000000000, 0xdefffffe, 0x5f000001},
16192 {0x8000014000000001, 0xdefffffd, 0x5f000001},
16193 {0x8000018000000000, 0xdefffffd, 0x5f000002},
16194 // Round up to produce a result that's too big for the input to represent.
16195 {0x00000000ffffff80, 0x4f800000, 0x4f800000},
16196 {0x00000000ffffffff, 0x4f800000, 0x4f800000},
16197 {0xffffff8000000000, 0xd3000000, 0x5f800000},
16198 {0xffffffffffffffff, 0xbf800000, 0x5f800000}};
16199 int pg_3[] = {0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1};
16200 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_3, data_set_3);
16201 // clang-format on
16202}
16203
16204TEST_SVE(scvtf_ucvtf_d_to_double) {
16205 // clang-format off
16206 int dst_lane_size = kDRegSize;
16207 int src_lane_size = kDRegSize;
16208
16209 // Simple conversions of positive numbers which require no rounding; the
16210 // results should not depend on the rounding mode, and ucvtf and scvtf should
16211 // produce the same result.
16212 CvtfTestDataSet data_set_1[] = {
16213 {0x0000000000000000, 0x0000000000000000, 0x0000000000000000},
16214 {0x0000000000000001, 0x3ff0000000000000, 0x3ff0000000000000},
16215 {0x0000000040000000, 0x41d0000000000000, 0x41d0000000000000},
16216 {0x0000000100000000, 0x41f0000000000000, 0x41f0000000000000},
16217 {0x4000000000000000, 0x43d0000000000000, 0x43d0000000000000}};
16218 int pg_1[] = {0, 1, 1, 0, 0};
16219 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_1, data_set_1);
16220
16221 CvtfTestDataSet data_set_2[] = {
16222 // Test mantissa extremities.
16223 {0x0010000000000001, 0x4330000000000001, 0x4330000000000001},
16224 {0x4008000000000000, 0x43d0020000000000, 0x43d0020000000000},
16225 // The largest int32_t that fits in a double.
16226 {0x000000007fffffff, 0x41dfffffffc00000, 0x41dfffffffc00000},
16227 // Values that would be negative if treated as an int32_t.
16228 {0x00000000ffffffff, 0x41efffffffe00000, 0x41efffffffe00000},
16229 {0x0000000080000000, 0x41e0000000000000, 0x41e0000000000000},
16230 {0x0000000080000001, 0x41e0000000200000, 0x41e0000000200000},
16231 // The largest int64_t that fits in a double.
16232 {0x7ffffffffffffc00, 0x43dfffffffffffff, 0x43dfffffffffffff},
16233 // Check for bit pattern reproduction.
16234 {0x0123456789abcde0, 0x43723456789abcde, 0x43723456789abcde},
16235 {0x0000000012345678, 0x41b2345678000000, 0x41b2345678000000}};
16236 int pg_2[] = {1, 1, 1, 1, 1, 0, 0, 0, 0};
16237 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_2, data_set_2);
16238
16239 CvtfTestDataSet data_set_3[] = {
16240 // Simple conversions of negative int64_t values. These require no rounding,
16241 // and the results should not depend on the rounding mode.
16242 {0xffffffffc0000000, 0xc1d0000000000000, 0x43effffffff80000},
16243 {0xffffffff00000000, 0xc1f0000000000000, 0x43efffffffe00000},
16244 {0xc000000000000000, 0xc3d0000000000000, 0x43e8000000000000},
16245 // Conversions which require rounding.
16246 {0x1000000000000280, 0x43b0000000000002, 0x43b0000000000002},
16247 {0x1000000000000281, 0x43b0000000000003, 0x43b0000000000003},
16248 {0x1000000000000300, 0x43b0000000000003, 0x43b0000000000003},
16249 // Check rounding of negative int64_t values.
16250 {0x8000000000000a00, 0xc3dffffffffffffe, 0x43e0000000000001},
16251 {0x8000000000000a01, 0xc3dffffffffffffd, 0x43e0000000000001},
16252 {0x8000000000000c00, 0xc3dffffffffffffd, 0x43e0000000000002},
16253 // Round up to produce a result that's too big for the input to represent.
16254 {0x7ffffffffffffe00, 0x43e0000000000000, 0x43e0000000000000},
16255 {0x7fffffffffffffff, 0x43e0000000000000, 0x43e0000000000000},
16256 {0xfffffffffffffc00, 0xc090000000000000, 0x43f0000000000000},
16257 {0xffffffffffffffff, 0xbff0000000000000, 0x43f0000000000000}};
16258 int pg_3[] = {1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0};
16259 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_3, data_set_3);
16260 // clang-format on
16261}
16262
16263TEST_SVE(scvtf_ucvtf_s_to_double) {
16264 // clang-format off
16265 int dst_lane_size = kDRegSize;
16266 int src_lane_size = kSRegSize;
16267
16268 // Simple conversions of positive numbers which require no rounding; the
16269 // results should not depend on the rounding mode, and ucvtf and scvtf should
16270 // produce the same result.
16271 CvtfTestDataSet data_set_1[] = {
16272 {0x00000000, 0x0000000000000000, 0x0000000000000000},
16273 {0x00000001, 0x3ff0000000000000, 0x3ff0000000000000},
16274 {0x00004000, 0x40d0000000000000, 0x40d0000000000000},
16275 {0x00010000, 0x40f0000000000000, 0x40f0000000000000},
16276 {0x40000000, 0x41d0000000000000, 0x41d0000000000000}};
16277 int pg_1[] = {1, 0, 0, 0, 1};
16278 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_1, data_set_1);
16279
16280 CvtfTestDataSet data_set_2[] = {
16281 // Test mantissa extremities.
16282 {0x40000400, 0x41d0000100000000, 0x41d0000100000000},
16283 // The largest int32_t that fits in a double.
16284 {0x7fffffff, 0x41dfffffffc00000, 0x41dfffffffc00000},
16285 // Values that would be negative if treated as an int32_t.
16286 {0xffffffff, 0xbff0000000000000, 0x41efffffffe00000},
16287 {0x80000000, 0xc1e0000000000000, 0x41e0000000000000},
16288 {0x80000001, 0xc1dfffffffc00000, 0x41e0000000200000},
16289 // Check for bit pattern reproduction.
16290 {0x089abcde, 0x41a13579bc000000, 0x41a13579bc000000},
16291 {0x12345678, 0x41b2345678000000, 0x41b2345678000000},
16292 // Simple conversions of negative int32_t values. These require no rounding,
16293 // and the results should not depend on the rounding mode.
16294 {0xffffc000, 0xc0d0000000000000, 0x41effff800000000},
16295 {0xffff0000, 0xc0f0000000000000, 0x41efffe000000000},
16296 {0xc0000000, 0xc1d0000000000000, 0x41e8000000000000}};
16297 int pg_2[] = {1, 0, 1, 0, 0, 1, 1, 0, 1, 1};
16298 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_2, data_set_2);
16299
16300 // Note that the IEEE 754 double-precision format has a 52-bit fraction, so all
16301 // 32-bit integers are exactly representable in double.
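  // For example, 0xffffffff converts to exactly 4294967295.0 (0x41efffffffe00000)
  // under ucvtf and to -1.0 (0xbff0000000000000) under scvtf, as in data_set_2
  // above.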
16302 // clang-format on
16303}
16304
16305TEST_SVE(sve_fadda) {
16306 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
16307 CPUFeatures::kFP,
16308 CPUFeatures::kFPHalf);
16309 START();
16310
16311 __ Ptrue(p0.VnB());
16312 __ Pfalse(p1.VnB());
16313 __ Zip1(p1.VnH(), p0.VnH(), p1.VnH());
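  // Zip1 interleaves the low halves of p0 (all true) and p1 (all false), so p1
  // should end up active in every other H lane starting with lane 0; the active
  // lanes of z0 below then hold the odd values 1, 3, 5, ...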
16314
16315 __ Index(z0.VnS(), 3, 3);
16316 __ Scvtf(z0.VnS(), p0.Merging(), z0.VnS());
16317 __ Fmov(s2, 2.0);
16318 __ Fadda(s2, p0, s2, z0.VnS());
16319
16320 __ Index(z0.VnD(), -7, -7);
16321 __ Scvtf(z0.VnD(), p0.Merging(), z0.VnD());
16322 __ Fmov(d3, 3.0);
16323 __ Fadda(d3, p0, d3, z0.VnD());
16324
16325 __ Index(z0.VnH(), 1, 1);
16326 __ Scvtf(z0.VnH(), p0.Merging(), z0.VnH());
16327 __ Fmov(h4, 0);
16328 __ Fadda(h4, p1, h4, z0.VnH());
16329 END();
16330
16331 if (CAN_RUN()) {
16332 RUN();
16333 // The sum of 1 .. n is (n + 1) * n / 2, i.e. n(n+1)/2.
16334 int n = core.GetSVELaneCount(kSRegSize);
16335 ASSERT_EQUAL_FP32(2 + 3 * ((n + 1) * (n / 2)), s2);
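    // As an illustration, with a 128-bit VL there are four S lanes holding 3.0,
    // 6.0, 9.0 and 12.0, so the expected accumulation above is 2.0 + 30.0 = 32.0.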
16336
16337 n /= 2; // Half as many lanes.
16338 ASSERT_EQUAL_FP64(3 + -7 * ((n + 1) * (n / 2)), d3);
16339
16340 // Sum of first n odd numbers is n^2.
16341 n = core.GetSVELaneCount(kHRegSize) / 2; // Half are odd numbers.
16342 ASSERT_EQUAL_FP16(Float16(n * n), h4);
16343 }
16344}
16345
16346TEST_SVE(sve_extract) {
16347 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
16348 START();
16349
16350 __ Index(z0.VnB(), 0, 1);
16351
16352 __ Mov(z1, z0);
16353 __ Mov(z2, z0);
16354 __ Mov(z3, z0);
16355 __ Mov(z4, z0);
16356 __ Mov(z5, z0);
16357 __ Mov(z6, z0);
16358
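  // Ext(zd, zn, zm, n) extracts a full-width vector from the byte-wise
  // concatenation of its two sources, starting at byte n of the first source. As
  // an illustration, with a 16-byte VL, Ext(z2, z2, z0, 1) below should produce
  // bytes {1, 2, ..., 15, 0}, which is what z2_expected checks.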
16359 __ Ext(z1, z1, z0, 0);
16360 __ Ext(z2, z2, z0, 1);
16361 __ Ext(z3, z3, z0, 15);
16362 __ Ext(z4, z4, z0, 31);
16363 __ Ext(z5, z5, z0, 47);
16364 __ Ext(z6, z6, z0, 255);
16365
16366 END();
16367
16368 if (CAN_RUN()) {
16369 RUN();
16370
16371 ASSERT_EQUAL_SVE(z1, z0);
16372
16373 int lane_count = core.GetSVELaneCount(kBRegSize);
16374 if (lane_count == 16) {
16375 uint64_t z2_expected[] = {0x000f0e0d0c0b0a09, 0x0807060504030201};
16376 ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
16377 } else {
16378 uint64_t z2_expected[] = {0x100f0e0d0c0b0a09, 0x0807060504030201};
16379 ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
16380 }
16381
16382 if (lane_count == 16) {
16383 uint64_t z3_expected[] = {0x0e0d0c0b0a090807, 0x060504030201000f};
16384 ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
16385 } else {
16386 uint64_t z3_expected[] = {0x1e1d1c1b1a191817, 0x161514131211100f};
16387 ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
16388 }
16389
16390 if (lane_count < 32) {
16391 ASSERT_EQUAL_SVE(z4, z0);
16392 } else if (lane_count == 32) {
16393 uint64_t z4_expected[] = {0x0e0d0c0b0a090807, 0x060504030201001f};
16394 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
16395 } else {
16396 uint64_t z4_expected[] = {0x2e2d2c2b2a292827, 0x262524232221201f};
16397 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
16398 }
16399
16400 if (lane_count < 48) {
16401 ASSERT_EQUAL_SVE(z5, z0);
16402 } else if (lane_count == 48) {
16403 uint64_t z5_expected[] = {0x0e0d0c0b0a090807, 0x060504030201002f};
16404 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
16405 } else {
16406 uint64_t z5_expected[] = {0x3e3d3c3b3a393837, 0x363534333231302f};
16407 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
16408 }
16409
16410 if (lane_count < 256) {
16411 ASSERT_EQUAL_SVE(z6, z0);
16412 } else {
16413 uint64_t z6_expected[] = {0x0e0d0c0b0a090807, 0x06050403020100ff};
16414 ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
16415 }
16416 }
16417}
16418
16419TEST_SVE(sve_fp_paired_across) {
16420 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
16421
16422 START();
16423
16424 __ Ptrue(p0.VnB());
16425 __ Pfalse(p1.VnB());
16426 __ Zip1(p2.VnS(), p0.VnS(), p1.VnS());
16427 __ Zip1(p3.VnD(), p0.VnD(), p1.VnD());
16428 __ Zip1(p4.VnH(), p0.VnH(), p1.VnH());
16429
16430 __ Index(z0.VnS(), 3, 3);
16431 __ Scvtf(z0.VnS(), p0.Merging(), z0.VnS());
16432 __ Faddv(s1, p0, z0.VnS());
16433 __ Fminv(s2, p2, z0.VnS());
16434 __ Fmaxv(s3, p2, z0.VnS());
16435
16436 __ Index(z0.VnD(), -7, -7);
16437 __ Scvtf(z0.VnD(), p0.Merging(), z0.VnD());
16438 __ Faddv(d4, p0, z0.VnD());
16439 __ Fminv(d5, p3, z0.VnD());
16440 __ Fmaxv(d6, p3, z0.VnD());
16441
16442 __ Index(z0.VnH(), 1, 1);
16443 __ Scvtf(z0.VnH(), p0.Merging(), z0.VnH());
16444 __ Faddv(h7, p4, z0.VnH());
16445 __ Fminv(h8, p4, z0.VnH());
16446 __ Fmaxv(h9, p4, z0.VnH());
16447
16448 __ Dup(z10.VnH(), 0);
16449 __ Fdiv(z10.VnH(), p0.Merging(), z10.VnH(), z10.VnH());
16450 __ Insr(z10.VnH(), 0x5140);
16451 __ Insr(z10.VnH(), 0xd140);
16452 __ Ext(z10.VnB(), z10.VnB(), z10.VnB(), 2);
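  // At this point z10.VnH() should hold the default NaN (from 0.0 / 0.0) in all
  // but two lanes, with +42.0 (0x5140) and -42.0 (0xd140) rotated in by Insr and
  // Ext. Fmaxnmv and Fminnmv ignore the NaNs, so the expected results are +/-42.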
16453 __ Fmaxnmv(h11, p0, z10.VnH());
16454 __ Fmaxnmv(h12, p4, z10.VnH());
16455 __ Fminnmv(h13, p0, z10.VnH());
16456 __ Fminnmv(h14, p4, z10.VnH());
16457
16458 __ Dup(z10.VnS(), 0);
16459 __ Fdiv(z10.VnS(), p0.Merging(), z10.VnS(), z10.VnS());
16460 __ Insr(z10.VnS(), 0x42280000);
16461 __ Insr(z10.VnS(), 0xc2280000);
16462 __ Ext(z10.VnB(), z10.VnB(), z10.VnB(), 4);
16463 __ Fmaxnmv(s15, p0, z10.VnS());
16464 __ Fmaxnmv(s16, p2, z10.VnS());
16465 __ Fminnmv(s17, p0, z10.VnS());
16466 __ Fminnmv(s18, p2, z10.VnS());
16467
16468 __ Dup(z10.VnD(), 0);
16469 __ Fdiv(z10.VnD(), p0.Merging(), z10.VnD(), z10.VnD());
16470 __ Insr(z10.VnD(), 0x4045000000000000);
16471 __ Insr(z10.VnD(), 0xc045000000000000);
16472 __ Ext(z10.VnB(), z10.VnB(), z10.VnB(), 8);
16473 __ Fmaxnmv(d19, p0, z10.VnD());
16474 __ Fmaxnmv(d20, p3, z10.VnD());
16475 __ Fminnmv(d21, p0, z10.VnD());
16476 __ Fminnmv(d22, p3, z10.VnD());
16477 END();
16478
16479 if (CAN_RUN()) {
16480 RUN();
16481 // The sum of 1 .. n is (n + 1) * n / 2, i.e. n(n+1)/2.
16482 int n = core.GetSVELaneCount(kSRegSize);
16483 ASSERT_EQUAL_FP32(3 * ((n + 1) * (n / 2)), s1);
16484 ASSERT_EQUAL_FP32(3, s2);
16485 ASSERT_EQUAL_FP32(3 * n - 3, s3);
16486
16487 n /= 2; // Half as many lanes.
16488 ASSERT_EQUAL_FP64(-7 * ((n + 1) * (n / 2)), d4);
16489 ASSERT_EQUAL_FP64(-7 * (n - 1), d5);
16490 ASSERT_EQUAL_FP64(-7, d6);
16491
16492 // Sum of first n odd numbers is n^2.
16493 n = core.GetSVELaneCount(kHRegSize) / 2; // Half are odd numbers.
16494 ASSERT_EQUAL_FP16(Float16(n * n), h7);
16495 ASSERT_EQUAL_FP16(Float16(1), h8);
16496
16497 n = core.GetSVELaneCount(kHRegSize);
16498 ASSERT_EQUAL_FP16(Float16(n - 1), h9);
16499
16500 ASSERT_EQUAL_FP16(Float16(42), h11);
16501 ASSERT_EQUAL_FP16(Float16(42), h12);
16502 ASSERT_EQUAL_FP16(Float16(-42), h13);
16503 ASSERT_EQUAL_FP16(Float16(42), h14);
16504 ASSERT_EQUAL_FP32(42, s15);
16505 ASSERT_EQUAL_FP32(42, s16);
16506 ASSERT_EQUAL_FP32(-42, s17);
16507 ASSERT_EQUAL_FP32(42, s18);
16508 ASSERT_EQUAL_FP64(42, d19);
16509 ASSERT_EQUAL_FP64(42, d20);
16510 ASSERT_EQUAL_FP64(-42, d21);
16511 ASSERT_EQUAL_FP64(42, d22);
16512 }
16513}
16514
16515TEST_SVE(sve_frecpe_frsqrte) {
16516 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
16517
16518 START();
16519
16520 __ Ptrue(p0.VnB());
16521
16522 __ Index(z0.VnH(), 0, 1);
16523 __ Fdup(z1.VnH(), Float16(1));
16524 __ Fscale(z1.VnH(), p0.Merging(), z1.VnH(), z0.VnH());
16525 __ Insr(z1.VnH(), 0);
16526 __ Frsqrte(z2.VnH(), z1.VnH());
16527 __ Frecpe(z1.VnH(), z1.VnH());
16528
16529 __ Index(z0.VnS(), 0, 1);
16530 __ Fdup(z3.VnS(), Float16(1));
16531 __ Fscale(z3.VnS(), p0.Merging(), z3.VnS(), z0.VnS());
16532 __ Insr(z3.VnS(), 0);
16533 __ Frsqrte(z4.VnS(), z3.VnS());
16534 __ Frecpe(z3.VnS(), z3.VnS());
16535
16536 __ Index(z0.VnD(), 0, 1);
16537 __ Fdup(z5.VnD(), Float16(1));
16538 __ Fscale(z5.VnD(), p0.Merging(), z5.VnD(), z0.VnD());
16539 __ Insr(z5.VnD(), 0);
16540 __ Frsqrte(z6.VnD(), z5.VnD());
16541 __ Frecpe(z5.VnD(), z5.VnD());
16542 END();
16543
16544 if (CAN_RUN()) {
16545 RUN();
16546 uint64_t z1_expected[] = {0x23fc27fc2bfc2ffc, 0x33fc37fc3bfc7c00};
16547 ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
16548 uint64_t z2_expected[] = {0x2ffc31a433fc35a4, 0x37fc39a43bfc7c00};
16549 ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
16550
16551 uint64_t z3_expected[] = {0x3e7f80003eff8000, 0x3f7f80007f800000};
16552 ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
16553 uint64_t z4_expected[] = {0x3eff80003f348000, 0x3f7f80007f800000};
16554 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
16555
16556 uint64_t z5_expected[] = {0x3feff00000000000, 0x7ff0000000000000};
16557 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
16558 uint64_t z6_expected[] = {0x3feff00000000000, 0x7ff0000000000000};
16559 ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
16560 }
16561}
16562
16563TEST_SVE(sve_frecps_frsqrts) {
16564 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
16565
16566 START();
16567 __ Ptrue(p0.VnB());
16568
16569 __ Index(z0.VnH(), 0, -1);
16570 __ Fdup(z1.VnH(), Float16(1));
16571 __ Fscale(z1.VnH(), p0.Merging(), z1.VnH(), z0.VnH());
16572 __ Scvtf(z0.VnH(), p0.Merging(), z0.VnH());
16573 __ Insr(z1.VnH(), 0);
16574 __ Frsqrts(z2.VnH(), z1.VnH(), z0.VnH());
16575 __ Frecps(z1.VnH(), z1.VnH(), z0.VnH());
16576
16577 __ Index(z0.VnS(), 0, -1);
16578 __ Fdup(z3.VnS(), Float16(1));
16579 __ Fscale(z3.VnS(), p0.Merging(), z3.VnS(), z0.VnS());
16580 __ Scvtf(z0.VnS(), p0.Merging(), z0.VnS());
16581 __ Insr(z3.VnS(), 0);
16582 __ Frsqrts(z4.VnS(), z3.VnS(), z0.VnS());
16583 __ Frecps(z3.VnS(), z3.VnS(), z0.VnS());
16584
16585 __ Index(z0.VnD(), 0, -1);
16586 __ Fdup(z5.VnD(), Float16(1));
16587 __ Fscale(z5.VnD(), p0.Merging(), z5.VnD(), z0.VnD());
16588 __ Scvtf(z0.VnD(), p0.Merging(), z0.VnD());
16589 __ Insr(z5.VnD(), 0);
16590 __ Frsqrts(z6.VnD(), z5.VnD(), z0.VnD());
16591 __ Frecps(z5.VnD(), z5.VnD(), z0.VnD());
16592 END();
16593
16594 if (CAN_RUN()) {
16595 RUN();
16596 uint64_t z1_expected[] = {0x4038406040a04100, 0x4180420042004000};
16597 ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
16598 uint64_t z2_expected[] = {0x3e383e603ea03f00, 0x3f80400040003e00};
16599 ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
16600
16601 uint64_t z3_expected[] = {0x4030000040400000, 0x4040000040000000};
16602 ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
16603 uint64_t z4_expected[] = {0x3ff0000040000000, 0x400000003fc00000};
16604 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
16605
16606 uint64_t z5_expected[] = {0x4008000000000000, 0x4000000000000000};
16607 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
16608 uint64_t z6_expected[] = {0x4000000000000000, 0x3ff8000000000000};
16609 ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
16610 }
16611}
16612
16613TEST_SVE(sve_ftsmul) {
16614 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
16615
16616 START();
16617 __ Ptrue(p0.VnB());
16618
16619 __ Index(z0.VnH(), 0, 1);
16620 __ Rev(z1.VnH(), z0.VnH());
16621 __ Scvtf(z0.VnH(), p0.Merging(), z0.VnH());
16622 __ Dup(z2.VnH(), 0);
16623 __ Fdiv(z2.VnH(), p0.Merging(), z2.VnH(), z2.VnH());
16624 __ Ftsmul(z3.VnH(), z0.VnH(), z1.VnH());
16625 __ Ftsmul(z4.VnH(), z2.VnH(), z1.VnH());
16626
16627 __ Index(z0.VnS(), -7, 1);
16628 __ Rev(z1.VnS(), z0.VnS());
16629 __ Scvtf(z0.VnS(), p0.Merging(), z0.VnS());
16630 __ Dup(z2.VnS(), 0);
16631 __ Fdiv(z2.VnS(), p0.Merging(), z2.VnS(), z2.VnS());
16632 __ Ftsmul(z5.VnS(), z0.VnS(), z1.VnS());
16633 __ Ftsmul(z6.VnS(), z2.VnS(), z1.VnS());
16634
16635 __ Index(z0.VnD(), 2, -1);
16636 __ Rev(z1.VnD(), z0.VnD());
16637 __ Scvtf(z0.VnD(), p0.Merging(), z0.VnD());
16638 __ Dup(z2.VnD(), 0);
16639 __ Fdiv(z2.VnD(), p0.Merging(), z2.VnD(), z2.VnD());
16640 __ Ftsmul(z7.VnD(), z0.VnD(), z1.VnD());
16641 __ Ftsmul(z8.VnD(), z2.VnD(), z1.VnD());
16642 END();
16643
16644 if (CAN_RUN()) {
16645 RUN();
16646 uint64_t z3_expected[] = {0x5220d0804e40cc00, 0x4880c4003c008000};
16647 ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
16648 uint64_t z4_expected[] = {0x7e007e007e007e00, 0x7e007e007e007e00};
16649 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
16650
16651 uint64_t z5_expected[] = {0xc180000041c80000, 0xc210000042440000};
16652 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
16653 uint64_t z6_expected[] = {0x7fc000007fc00000, 0x7fc000007fc00000};
16654 ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
16655
16656 uint64_t z7_expected[] = {0x3ff0000000000000, 0xc010000000000000};
16657 ASSERT_EQUAL_SVE(z7_expected, z7.VnD());
16658 uint64_t z8_expected[] = {0x7ff8000000000000, 0x7ff8000000000000};
16659 ASSERT_EQUAL_SVE(z8_expected, z8.VnD());
16660 }
16661}
16662
16663typedef void (MacroAssembler::*FPMulAccFn)(
16664 const ZRegister& zd,
16665 const PRegisterM& pg,
16666 const ZRegister& za,
16667 const ZRegister& zn,
16668 const ZRegister& zm,
16669 FPMacroNaNPropagationOption nan_option);
16670
16671// `pg_inputs` is used to check internally that predication is applied correctly.
16672// It does not affect the `result` argument: `result` holds the expected results
16673// under an all-true predicate.
16674template <typename T, size_t N>
16675static void FPMulAccHelper(
16676 Test* config,
16677 FPMulAccFn macro,
16678 unsigned lane_size_in_bits,
16679 const int (&pg_inputs)[N],
16680 const T (&za_inputs)[N],
16681 const T (&zn_inputs)[N],
16682 const T (&zm_inputs)[N],
16683 const uint64_t (&result)[N],
16684 FPMacroNaNPropagationOption nan_option = FastNaNPropagation) {
16685 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
16686 START();
16687
16688 ZRegister zd = z0.WithLaneSize(lane_size_in_bits);
16689 ZRegister za = z1.WithLaneSize(lane_size_in_bits);
16690 ZRegister zn = z2.WithLaneSize(lane_size_in_bits);
16691 ZRegister zm = z3.WithLaneSize(lane_size_in_bits);
16692
16693 uint64_t za_rawbits[N];
16694 uint64_t zn_rawbits[N];
16695 uint64_t zm_rawbits[N];
16696
16697 FPToRawbitsWithSize(za_inputs, za_rawbits, lane_size_in_bits);
16698 FPToRawbitsWithSize(zn_inputs, zn_rawbits, lane_size_in_bits);
16699 FPToRawbitsWithSize(zm_inputs, zm_rawbits, lane_size_in_bits);
16700
16701 InsrHelper(&masm, za, za_rawbits);
16702 InsrHelper(&masm, zn, zn_rawbits);
16703 InsrHelper(&masm, zm, zm_rawbits);
16704
16705 // Initialize `zd` with a signalling NaN.
16706 uint64_t sn = GetSignallingNan(lane_size_in_bits);
16707 __ Mov(x29, sn);
16708 __ Dup(zd, x29);
16709
16710 Initialise(&masm, p0.WithLaneSize(lane_size_in_bits), pg_inputs);
16711
16712 // The Fmla macro automatically selects between fmla, fmad and movprfx + fmla,
16713 // Fmls between fmls, fmsb and movprfx + fmls,
16714 // Fnmla between fnmla, fnmad and movprfx + fnmla, and
16715 // Fnmls between fnmls, fnmsb and movprfx + fnmls,
16716 // based on which registers are aliased.
16717 ZRegister da_result = z10.WithLaneSize(lane_size_in_bits);
16718 ZRegister dn_result = z11.WithLaneSize(lane_size_in_bits);
16719 ZRegister dm_result = z12.WithLaneSize(lane_size_in_bits);
16720 ZRegister d_result = z13.WithLaneSize(lane_size_in_bits);
16721
16722 __ Mov(da_result, za);
16723 (masm.*macro)(da_result, p0.Merging(), da_result, zn, zm, nan_option);
16724
16725 __ Mov(dn_result, zn);
16726 (masm.*macro)(dn_result, p0.Merging(), za, dn_result, zm, nan_option);
16727
16728 __ Mov(dm_result, zm);
16729 (masm.*macro)(dm_result, p0.Merging(), za, zn, dm_result, nan_option);
16730
16731 __ Mov(d_result, zd);
16732 (masm.*macro)(d_result, p0.Merging(), za, zn, zm, nan_option);
16733
16734 END();
16735
16736 if (CAN_RUN()) {
16737 RUN();
16738
16739 ASSERT_EQUAL_SVE(za_rawbits, za);
16740 ASSERT_EQUAL_SVE(zn_rawbits, zn);
16741 ASSERT_EQUAL_SVE(zm_rawbits, zm);
16742
16743 uint64_t da_expected[N];
16744 uint64_t dn_expected[N];
16745 uint64_t dm_expected[N];
16746 uint64_t d_expected[N];
16747 for (size_t i = 0; i < N; i++) {
16748 da_expected[i] = ((pg_inputs[i] & 1) != 0) ? result[i] : za_rawbits[i];
16749 dn_expected[i] = ((pg_inputs[i] & 1) != 0) ? result[i] : zn_rawbits[i];
16750 dm_expected[i] = ((pg_inputs[i] & 1) != 0) ? result[i] : zm_rawbits[i];
16751 d_expected[i] = ((pg_inputs[i] & 1) != 0) ? result[i] : sn;
16752 }
16753
16754 ASSERT_EQUAL_SVE(da_expected, da_result);
16755 ASSERT_EQUAL_SVE(dn_expected, dn_result);
16756 ASSERT_EQUAL_SVE(dm_expected, dm_result);
16757 ASSERT_EQUAL_SVE(d_expected, d_result);
16758 }
16759}
16760
16761TEST_SVE(sve_fmla_fmad) {
16762 // fmla : zd = za + zn * zm
16763 double za_inputs[] = {-39.0, 1.0, -3.0, 2.0};
16764 double zn_inputs[] = {-5.0, -20.0, 9.0, 8.0};
16765 double zm_inputs[] = {9.0, -5.0, 4.0, 5.0};
16766 int pg_inputs[] = {1, 1, 0, 1};
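  // As a worked example for lane 0: -39.0 + (-5.0 * 9.0) = -84.0, which matches the
  // first entry of each expected-result array below.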
16767
16768 uint64_t fmla_result_h[] = {Float16ToRawbits(Float16(-84.0)),
16769 Float16ToRawbits(Float16(101.0)),
16770 Float16ToRawbits(Float16(33.0)),
16771 Float16ToRawbits(Float16(42.0))};
16772
16773 // `fmad` has been tested in the helper.
16774 FPMulAccHelper(config,
16775 &MacroAssembler::Fmla,
16776 kHRegSize,
16777 pg_inputs,
16778 za_inputs,
16779 zn_inputs,
16780 zm_inputs,
16781 fmla_result_h);
16782
16783 uint64_t fmla_result_s[] = {FloatToRawbits(-84.0f),
16784 FloatToRawbits(101.0f),
16785 FloatToRawbits(33.0f),
16786 FloatToRawbits(42.0f)};
16787
16788 FPMulAccHelper(config,
16789 &MacroAssembler::Fmla,
16790 kSRegSize,
16791 pg_inputs,
16792 za_inputs,
16793 zn_inputs,
16794 zm_inputs,
16795 fmla_result_s);
16796
16797 uint64_t fmla_result_d[] = {DoubleToRawbits(-84.0),
16798 DoubleToRawbits(101.0),
16799 DoubleToRawbits(33.0),
16800 DoubleToRawbits(42.0)};
16801
16802 FPMulAccHelper(config,
16803 &MacroAssembler::Fmla,
16804 kDRegSize,
16805 pg_inputs,
16806 za_inputs,
16807 zn_inputs,
16808 zm_inputs,
16809 fmla_result_d);
16810}
16811
16812TEST_SVE(sve_fmls_fmsb) {
16813 // fmls : zd = za - zn * zm
16814 double za_inputs[] = {-39.0, 1.0, -3.0, 2.0};
16815 double zn_inputs[] = {-5.0, -20.0, 9.0, 8.0};
16816 double zm_inputs[] = {9.0, -5.0, 4.0, 5.0};
16817 int pg_inputs[] = {1, 0, 1, 1};
16818
16819 uint64_t fmls_result_h[] = {Float16ToRawbits(Float16(6.0)),
16820 Float16ToRawbits(Float16(-99.0)),
16821 Float16ToRawbits(Float16(-39.0)),
16822 Float16ToRawbits(Float16(-38.0))};
16823
16824 // `fmsb` has been tested in the helper.
16825 FPMulAccHelper(config,
16826 &MacroAssembler::Fmls,
16827 kHRegSize,
16828 pg_inputs,
16829 za_inputs,
16830 zn_inputs,
16831 zm_inputs,
16832 fmls_result_h);
16833
16834 uint64_t fmls_result_s[] = {FloatToRawbits(6.0f),
16835 FloatToRawbits(-99.0f),
16836 FloatToRawbits(-39.0f),
16837 FloatToRawbits(-38.0f)};
16838
16839 FPMulAccHelper(config,
16840 &MacroAssembler::Fmls,
16841 kSRegSize,
16842 pg_inputs,
16843 za_inputs,
16844 zn_inputs,
16845 zm_inputs,
16846 fmls_result_s);
16847
16848 uint64_t fmls_result_d[] = {DoubleToRawbits(6.0),
16849 DoubleToRawbits(-99.0),
16850 DoubleToRawbits(-39.0),
16851 DoubleToRawbits(-38.0)};
16852
16853 FPMulAccHelper(config,
16854 &MacroAssembler::Fmls,
16855 kDRegSize,
16856 pg_inputs,
16857 za_inputs,
16858 zn_inputs,
16859 zm_inputs,
16860 fmls_result_d);
16861}
16862
16863TEST_SVE(sve_fnmla_fnmad) {
16864 // fnmla : zd = -za - zn * zm
16865 double za_inputs[] = {-39.0, 1.0, -3.0, 2.0};
16866 double zn_inputs[] = {-5.0, -20.0, 9.0, 8.0};
16867 double zm_inputs[] = {9.0, -5.0, 4.0, 5.0};
16868 int pg_inputs[] = {0, 1, 1, 1};
16869
16870 uint64_t fnmla_result_h[] = {Float16ToRawbits(Float16(84.0)),
16871 Float16ToRawbits(Float16(-101.0)),
16872 Float16ToRawbits(Float16(-33.0)),
16873 Float16ToRawbits(Float16(-42.0))};
16874
16875 // `fnmad` has been tested in the helper.
16876 FPMulAccHelper(config,
16877 &MacroAssembler::Fnmla,
16878 kHRegSize,
16879 pg_inputs,
16880 za_inputs,
16881 zn_inputs,
16882 zm_inputs,
16883 fnmla_result_h);
16884
16885 uint64_t fnmla_result_s[] = {FloatToRawbits(84.0f),
16886 FloatToRawbits(-101.0f),
16887 FloatToRawbits(-33.0f),
16888 FloatToRawbits(-42.0f)};
16889
16890 FPMulAccHelper(config,
16891 &MacroAssembler::Fnmla,
16892 kSRegSize,
16893 pg_inputs,
16894 za_inputs,
16895 zn_inputs,
16896 zm_inputs,
16897 fnmla_result_s);
16898
16899 uint64_t fnmla_result_d[] = {DoubleToRawbits(84.0),
16900 DoubleToRawbits(-101.0),
16901 DoubleToRawbits(-33.0),
16902 DoubleToRawbits(-42.0)};
16903
16904 FPMulAccHelper(config,
16905 &MacroAssembler::Fnmla,
16906 kDRegSize,
16907 pg_inputs,
16908 za_inputs,
16909 zn_inputs,
16910 zm_inputs,
16911 fnmla_result_d);
16912}
16913
16914TEST_SVE(sve_fnmls_fnmsb) {
16915 // fnmls : zd = -za + zn * zm
16916 double za_inputs[] = {-39.0, 1.0, -3.0, 2.0};
16917 double zn_inputs[] = {-5.0, -20.0, 9.0, 8.0};
16918 double zm_inputs[] = {9.0, -5.0, 4.0, 5.0};
16919 int pg_inputs[] = {1, 1, 1, 0};
16920
16921 uint64_t fnmls_result_h[] = {Float16ToRawbits(Float16(-6.0)),
16922 Float16ToRawbits(Float16(99.0)),
16923 Float16ToRawbits(Float16(39.0)),
16924 Float16ToRawbits(Float16(38.0))};
16925
16926 // `fnmsb` has been tested in the helper.
16927 FPMulAccHelper(config,
16928 &MacroAssembler::Fnmls,
16929 kHRegSize,
16930 pg_inputs,
16931 za_inputs,
16932 zn_inputs,
16933 zm_inputs,
16934 fnmls_result_h);
16935
16936 uint64_t fnmls_result_s[] = {FloatToRawbits(-6.0f),
16937 FloatToRawbits(99.0f),
16938 FloatToRawbits(39.0f),
16939 FloatToRawbits(38.0f)};
16940
16941 FPMulAccHelper(config,
16942 &MacroAssembler::Fnmls,
16943 kSRegSize,
16944 pg_inputs,
16945 za_inputs,
16946 zn_inputs,
16947 zm_inputs,
16948 fnmls_result_s);
16949
16950 uint64_t fnmls_result_d[] = {DoubleToRawbits(-6.0),
16951 DoubleToRawbits(99.0),
16952 DoubleToRawbits(39.0),
16953 DoubleToRawbits(38.0)};
16954
16955 FPMulAccHelper(config,
16956 &MacroAssembler::Fnmls,
16957 kDRegSize,
16958 pg_inputs,
16959 za_inputs,
16960 zn_inputs,
16961 zm_inputs,
16962 fnmls_result_d);
16963}
16964
16965// Create a pattern in dst where the value of each element in src is incremented
16966// by the segment number. This allows varying a short input by a predictable
16967// pattern for each segment.
16968static void FPSegmentPatternHelper(MacroAssembler* masm,
16969 const ZRegister& dst,
16970 const PRegisterM& ptrue,
16971 const ZRegister& src) {
16972 VIXL_ASSERT(AreSameLaneSize(dst, src));
16973 UseScratchRegisterScope temps(masm);
16974 ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(dst);
16975 masm->Index(ztmp, 0, 1);
16976 masm->Asr(ztmp, ztmp, kQRegSizeInBytesLog2 - dst.GetLaneSizeInBytesLog2());
16977 masm->Scvtf(ztmp, ptrue, ztmp);
16978 masm->Fadd(dst, src, ztmp);
16979}
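
// A scalar sketch of the pattern produced above (for illustration only; the name
// `lanes_per_segment` is introduced here and stands for 16 bytes divided by the
// lane size in bytes):
//
//   for (int i = 0; i < lane_count; i++) {
//     dst[i] = src[i] + static_cast<double>(i / lanes_per_segment);
//   }
//
// so the increment is i / 8 for H lanes, i / 4 for S lanes and i / 2 for D lanes.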
16980
16981typedef void (MacroAssembler::*FPMulAccIdxFn)(const ZRegister& zd,
16982 const ZRegister& za,
16983 const ZRegister& zn,
16984 const ZRegister& zm,
16985 int index);
16986
16987template <typename T, size_t N>
16988static void FPMulAccIdxHelper(Test* config,
16989 FPMulAccFn macro,
16990 FPMulAccIdxFn macro_idx,
16991 const T (&za_inputs)[N],
16992 const T (&zn_inputs)[N],
16993 const T (&zm_inputs)[N]) {
16994 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
16995 START();
16996
16997 __ Ptrue(p0.VnB());
16998
16999 // Repeat indexed vector across up to 2048-bit VL.
17000 for (size_t i = 0; i < (kZRegMaxSize / kDRegSize); i += N) {
17001 InsrHelper(&masm, z30.VnD(), zm_inputs);
17002 }
17003
17004 FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z30.VnH());
17005
17006 InsrHelper(&masm, z1.VnD(), zn_inputs);
17007 InsrHelper(&masm, z2.VnD(), za_inputs);
17008
17009 __ Mov(z3, z0);
17010 (masm.*macro_idx)(z3.VnH(), z2.VnH(), z1.VnH(), z3.VnH(), 0); // zd == zm
17011 __ Mov(z4, z1);
17012 (masm.*macro_idx)(z4.VnH(), z2.VnH(), z4.VnH(), z0.VnH(), 1); // zd == zn
17013 __ Mov(z5, z2);
17014 (masm.*macro_idx)(z5.VnH(), z5.VnH(), z1.VnH(), z0.VnH(), 4); // zd == za
17015 (masm.*macro_idx)(z6.VnH(), z2.VnH(), z1.VnH(), z0.VnH(), 7);
17016
17017 FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z30.VnS());
17018
17019 __ Mov(z7, z0);
17020 (masm.*macro_idx)(z7.VnS(), z2.VnS(), z1.VnS(), z7.VnS(), 0); // zd == zm
17021 __ Mov(z8, z1);
17022 (masm.*macro_idx)(z8.VnS(), z2.VnS(), z8.VnS(), z0.VnS(), 1); // zd == zn
17023 __ Mov(z9, z2);
17024 (masm.*macro_idx)(z9.VnS(), z9.VnS(), z1.VnS(), z0.VnS(), 2); // zd == za
17025 (masm.*macro_idx)(z10.VnS(), z2.VnS(), z1.VnS(), z0.VnS(), 3);
17026
17027 FPSegmentPatternHelper(&masm, z0.VnD(), p0.Merging(), z30.VnD());
17028
17029 __ Mov(z11, z0);
17030 (masm.*macro_idx)(z11.VnD(), z2.VnD(), z1.VnD(), z11.VnD(), 0); // zd == zm
17031 __ Mov(z12, z1);
17032 (masm.*macro_idx)(z12.VnD(), z2.VnD(), z12.VnD(), z0.VnD(), 1); // zd == zn
17033 __ Mov(z13, z2);
17034 (masm.*macro_idx)(z13.VnD(), z13.VnD(), z1.VnD(), z0.VnD(), 0); // zd == za
17035 __ Mov(z14, z0);
17036 // zd == zn == zm
17037 (masm.*macro_idx)(z14.VnD(), z2.VnD(), z14.VnD(), z14.VnD(), 1);
17038
17039 // The indexed forms of Fmla and Fmls never swap their arguments, so pass the
17040 // strict NaN propagation mode to ensure that the reference macros below do not
17041 // swap arguments in any case either.
17042 FPMacroNaNPropagationOption option = StrictNaNPropagation;
17043 // Compute the results using other instructions.
17044 __ Dup(z0.VnH(), z30.VnH(), 0);
17045 FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z0.VnH());
17046 (masm.*macro)(z15.VnH(), p0.Merging(), z2.VnH(), z1.VnH(), z0.VnH(), option);
17047 __ Dup(z0.VnH(), z30.VnH(), 1);
17048 FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z0.VnH());
17049 (masm.*macro)(z16.VnH(), p0.Merging(), z2.VnH(), z1.VnH(), z0.VnH(), option);
17050 __ Dup(z0.VnH(), z30.VnH(), 4);
17051 FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z0.VnH());
17052 (masm.*macro)(z17.VnH(), p0.Merging(), z2.VnH(), z1.VnH(), z0.VnH(), option);
17053 __ Dup(z0.VnH(), z30.VnH(), 7);
17054 FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z0.VnH());
17055 (masm.*macro)(z18.VnH(), p0.Merging(), z2.VnH(), z1.VnH(), z0.VnH(), option);
17056
17057 __ Dup(z0.VnS(), z30.VnS(), 0);
17058 FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z0.VnS());
17059 (masm.*macro)(z19.VnS(), p0.Merging(), z2.VnS(), z1.VnS(), z0.VnS(), option);
17060 __ Dup(z0.VnS(), z30.VnS(), 1);
17061 FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z0.VnS());
17062 (masm.*macro)(z20.VnS(), p0.Merging(), z2.VnS(), z1.VnS(), z0.VnS(), option);
17063 __ Dup(z0.VnS(), z30.VnS(), 2);
17064 FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z0.VnS());
17065 (masm.*macro)(z21.VnS(), p0.Merging(), z2.VnS(), z1.VnS(), z0.VnS(), option);
17066 __ Dup(z0.VnS(), z30.VnS(), 3);
17067 FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z0.VnS());
17068 (masm.*macro)(z22.VnS(), p0.Merging(), z2.VnS(), z1.VnS(), z0.VnS(), option);
17069
17070 __ Dup(z0.VnD(), z30.VnD(), 0);
17071 FPSegmentPatternHelper(&masm, z0.VnD(), p0.Merging(), z0.VnD());
17072 (masm.*macro)(z23.VnD(), p0.Merging(), z2.VnD(), z1.VnD(), z0.VnD(), option);
17073 __ Dup(z0.VnD(), z30.VnD(), 1);
17074 FPSegmentPatternHelper(&masm, z0.VnD(), p0.Merging(), z0.VnD());
17075 (masm.*macro)(z24.VnD(), p0.Merging(), z2.VnD(), z1.VnD(), z0.VnD(), option);
17076 FPSegmentPatternHelper(&masm, z0.VnD(), p0.Merging(), z30.VnD());
17077 __ Dup(z29.VnD(), z30.VnD(), 1);
17078 FPSegmentPatternHelper(&masm, z29.VnD(), p0.Merging(), z29.VnD());
17079 (masm.*macro)(z25.VnD(), p0.Merging(), z2.VnD(), z0.VnD(), z29.VnD(), option);
17080
17081 END();
17082
17083 if (CAN_RUN()) {
17084 RUN();
17085
17086 ASSERT_EQUAL_SVE(z15.VnH(), z3.VnH());
17087 ASSERT_EQUAL_SVE(z16.VnH(), z4.VnH());
17088 ASSERT_EQUAL_SVE(z17.VnH(), z5.VnH());
17089 ASSERT_EQUAL_SVE(z18.VnH(), z6.VnH());
17090
17091 ASSERT_EQUAL_SVE(z19.VnS(), z7.VnS());
17092 ASSERT_EQUAL_SVE(z20.VnS(), z8.VnS());
17093 ASSERT_EQUAL_SVE(z21.VnS(), z9.VnS());
17094 ASSERT_EQUAL_SVE(z22.VnS(), z10.VnS());
17095
17096 ASSERT_EQUAL_SVE(z23.VnD(), z11.VnD());
17097 ASSERT_EQUAL_SVE(z24.VnD(), z12.VnD());
17098 ASSERT_EQUAL_SVE(z11.VnD(), z13.VnD());
17099 ASSERT_EQUAL_SVE(z25.VnD(), z14.VnD());
17100 }
17101}
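
// A note on the verification strategy above (an explanatory aside): the indexed
// forms of Fmla and Fmls select a single zm element within each 128-bit segment,
// so each indexed result is compared against the vector form applied to a Dup of
// that element, with FPSegmentPatternHelper re-applying the same per-segment
// increment to the broadcast value.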
17102
17103TEST_SVE(sve_fmla_fmls_index) {
17104 uint64_t zm_inputs_1[] = {0x3ff000003f803c00, 0xbff00000bf80bc00};
17105 uint64_t zn_inputs_1[] = {0x3ff012343ff03c76, 0xbff01234bff0bc76};
17106 uint64_t za_inputs_1[] = {0x3c004000bc00c000, 0x64006800e400e800};
17107
17108 // Using the vector form of Fmla and Fmls to verify the indexed form.
17109 FPMulAccIdxHelper(config,
17110 &MacroAssembler::Fmla, // vector form
17111 &MacroAssembler::Fmla, // indexed form
17112 za_inputs_1,
17113 zn_inputs_1,
17114 zm_inputs_1);
17115
17116 FPMulAccIdxHelper(config,
17117 &MacroAssembler::Fmls, // vector form
17118 &MacroAssembler::Fmls, // indexed form
17119 za_inputs_1,
17120 zn_inputs_1,
17121 zm_inputs_1);
17122
17123 uint64_t zm_inputs_2[] = {0x7ff5555511111111, // NaN
17124 0xfff0000000000000}; // Infinity
17125 uint64_t zn_inputs_2[] = {0x7f9511117fc00000, // NaN
17126 0x7f800000ff800000}; // Infinity
17127 uint64_t za_inputs_2[] = {0x7c11000000007e00, // NaN
17128 0x000000007c00fc00}; // Infinity
17129 FPMulAccIdxHelper(config,
17130 &MacroAssembler::Fmla, // vector form
17131 &MacroAssembler::Fmla, // indexed form
17132 za_inputs_2,
17133 zn_inputs_2,
17134 zm_inputs_2);
17135
17136 FPMulAccIdxHelper(config,
17137 &MacroAssembler::Fmls, // vector form
17138 &MacroAssembler::Fmls, // indexed form
17139 za_inputs_2,
17140 zn_inputs_2,
17141 zm_inputs_2);
17142}
17143
17144// Execute a number of instructions which all use ProcessNaNs, and check that
17145// they all propagate NaNs correctly.
17146template <typename Ti, typename Td, size_t N>
17147static void ProcessNaNsHelper(Test* config,
17148 int lane_size_in_bits,
17149 const Ti (&zn_inputs)[N],
17150 const Ti (&zm_inputs)[N],
17151 const Td (&zd_expected)[N],
17152 FPMacroNaNPropagationOption nan_option) {
17153 ArithFn arith_unpredicated_macro[] = {&MacroAssembler::Fadd,
17154 &MacroAssembler::Fsub,
17155 &MacroAssembler::Fmul};
17156
17157 for (size_t i = 0; i < ArrayLength(arith_unpredicated_macro); i++) {
17158 FPBinArithHelper(config,
17159 arith_unpredicated_macro[i],
17160 lane_size_in_bits,
17161 zn_inputs,
17162 zm_inputs,
17163 zd_expected);
17164 }
17165
17166 FPArithPredicatedFn arith_predicated_macro[] = {&MacroAssembler::Fmax,
17167 &MacroAssembler::Fmin};
17168 int pg_inputs[N];
17169 // With an all-true predicate, this helper aims to compare with special
17170 // numbers.
17171 for (size_t i = 0; i < N; i++) {
17172 pg_inputs[i] = 1;
17173 }
17174
17175 // fdivr propagates the quotient (Zm) preferentially, so we don't actually
17176 // need any special handling for StrictNaNPropagation.
17177 FPBinArithHelper(config,
17178 NULL,
17179 &MacroAssembler::Fdiv,
17180 lane_size_in_bits,
17181 // With an all-true predicate, the value in zd is
17182 // irrelevant to the operations.
17183 zn_inputs,
17184 pg_inputs,
17185 zn_inputs,
17186 zm_inputs,
17187 zd_expected);
17188
17189 for (size_t i = 0; i < ArrayLength(arith_predicated_macro); i++) {
17190 FPBinArithHelper(config,
17191 arith_predicated_macro[i],
17192 NULL,
17193 lane_size_in_bits,
17194 // With an all-true predicate, the value in zd is
17195 // irrelevant to the operations.
17196 zn_inputs,
17197 pg_inputs,
17198 zn_inputs,
17199 zm_inputs,
17200 zd_expected,
17201 nan_option);
17202 }
17203}
17204
17205template <typename Ti, typename Td, size_t N>
17206static void ProcessNaNsHelper3(Test* config,
17207 int lane_size_in_bits,
17208 const Ti (&za_inputs)[N],
17209 const Ti (&zn_inputs)[N],
17210 const Ti (&zm_inputs)[N],
17211 const Td (&zd_expected_fmla)[N],
17212 const Td (&zd_expected_fmls)[N],
17213 const Td (&zd_expected_fnmla)[N],
17214 const Td (&zd_expected_fnmls)[N],
17215 FPMacroNaNPropagationOption nan_option) {
17216 int pg_inputs[N];
17217 // With an all-true predicate, this helper aims to compare with special
17218 // numbers.
17219 for (size_t i = 0; i < N; i++) {
17220 pg_inputs[i] = 1;
17221 }
17222
17223 FPMulAccHelper(config,
17224 &MacroAssembler::Fmla,
17225 lane_size_in_bits,
17226 pg_inputs,
17227 za_inputs,
17228 zn_inputs,
17229 zm_inputs,
17230 zd_expected_fmla,
17231 nan_option);
17232
17233 FPMulAccHelper(config,
17234 &MacroAssembler::Fmls,
17235 lane_size_in_bits,
17236 pg_inputs,
17237 za_inputs,
17238 zn_inputs,
17239 zm_inputs,
17240 zd_expected_fmls,
17241 nan_option);
17242
17243 FPMulAccHelper(config,
17244 &MacroAssembler::Fnmla,
17245 lane_size_in_bits,
17246 pg_inputs,
17247 za_inputs,
17248 zn_inputs,
17249 zm_inputs,
17250 zd_expected_fnmla,
17251 nan_option);
17252
17253 FPMulAccHelper(config,
17254 &MacroAssembler::Fnmls,
17255 lane_size_in_bits,
17256 pg_inputs,
17257 za_inputs,
17258 zn_inputs,
17259 zm_inputs,
17260 zd_expected_fnmls,
17261 nan_option);
17262}
17263
17264TEST_SVE(sve_process_nans_double) {
17265 // Use non-standard NaNs to check that the payload bits are preserved.
17266 double sa = RawbitsToDouble(0x7ff5555511111111);
17267 double sn = RawbitsToDouble(0x7ff5555522222222);
17268 double sm = RawbitsToDouble(0x7ff5555533333333);
17269 double qa = RawbitsToDouble(0x7ffaaaaa11111111);
17270 double qn = RawbitsToDouble(0x7ffaaaaa22222222);
17271 double qm = RawbitsToDouble(0x7ffaaaaa33333333);
17272 VIXL_ASSERT(IsSignallingNaN(sa));
17273 VIXL_ASSERT(IsSignallingNaN(sn));
17274 VIXL_ASSERT(IsSignallingNaN(sm));
17275 VIXL_ASSERT(IsQuietNaN(qa));
17276 VIXL_ASSERT(IsQuietNaN(qn));
17277 VIXL_ASSERT(IsQuietNaN(qm));
17278
17279 // The input NaNs after passing through ProcessNaN.
17280 uint64_t sa_proc = 0x7ffd555511111111;
17281 uint64_t sn_proc = 0x7ffd555522222222;
17282 uint64_t sm_proc = 0x7ffd555533333333;
17283 uint64_t qa_proc = DoubleToRawbits(qa);
17284 uint64_t qn_proc = DoubleToRawbits(qn);
17285 uint64_t qm_proc = DoubleToRawbits(qm);
17286 uint64_t sa_proc_n = sa_proc ^ kDSignMask;
17287 uint64_t sn_proc_n = sn_proc ^ kDSignMask;
17288 uint64_t qa_proc_n = qa_proc ^ kDSignMask;
17289 uint64_t qn_proc_n = qn_proc ^ kDSignMask;
17290
17291 // Quiet NaNs are propagated.
17292 double zn_inputs_1[] = {qn, 0.0, 0.0, qm, qn, qm};
17293 double zm_inputs_1[] = {0.0, qn, qm, 0.0, qm, qn};
17294 uint64_t zd_expected_1[] =
17295 {qn_proc, qn_proc, qm_proc, qm_proc, qn_proc, qm_proc};
17296
17297 ProcessNaNsHelper(config,
17298 kDRegSize,
17299 zn_inputs_1,
17300 zm_inputs_1,
17301 zd_expected_1,
17302 StrictNaNPropagation);
17303
17304 // Signalling NaNs are propagated.
17305 double zn_inputs_2[] = {sn, 0.0, 0.0, sm, sn, sm};
17306 double zm_inputs_2[] = {0.0, sn, sm, 0.0, sm, sn};
17307 uint64_t zd_expected_2[] =
17308 {sn_proc, sn_proc, sm_proc, sm_proc, sn_proc, sm_proc};
17309 ProcessNaNsHelper(config,
17310 kDRegSize,
17311 zn_inputs_2,
17312 zm_inputs_2,
17313 zd_expected_2,
17314 StrictNaNPropagation);
17315
17316 // Signalling NaNs take precedence over quiet NaNs.
17317 double zn_inputs_3[] = {sn, qn, sn, sn, qn};
17318 double zm_inputs_3[] = {qm, sm, sm, qn, sn};
17319 uint64_t zd_expected_3[] = {sn_proc, sm_proc, sn_proc, sn_proc, sn_proc};
17320 ProcessNaNsHelper(config,
17321 kDRegSize,
17322 zn_inputs_3,
17323 zm_inputs_3,
17324 zd_expected_3,
17325 StrictNaNPropagation);
17326
17327 double za_inputs_4[] = {qa, qa, 0.0, 0.0, qa, qa};
17328 double zn_inputs_4[] = {qn, 0.0, 0.0, qn, qn, qn};
17329 double zm_inputs_4[] = {0.0, qm, qm, qm, qm, 0.0};
17330
17331 // If `a` is propagated, its sign is inverted by fnmla and fnmls.
17332 // If `n` is propagated, its sign is inverted by fmls and fnmla.
17333 // If `m` is propagated, its sign is never inverted.
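  // For instance, in the first lane below `za` is the quiet NaN qa, so fmla and
  // fmls propagate qa unchanged while fnmla and fnmls produce qa with its sign bit
  // flipped (qa_proc versus qa_proc_n).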
17334 uint64_t zd_expected_fmla_4[] =
17335 {qa_proc, qa_proc, qm_proc, qn_proc, qa_proc, qa_proc};
17336 uint64_t zd_expected_fmls_4[] =
17337 {qa_proc, qa_proc, qm_proc, qn_proc_n, qa_proc, qa_proc};
17338 uint64_t zd_expected_fnmla_4[] =
17339 {qa_proc_n, qa_proc_n, qm_proc, qn_proc_n, qa_proc_n, qa_proc_n};
17340 uint64_t zd_expected_fnmls_4[] =
17341 {qa_proc_n, qa_proc_n, qm_proc, qn_proc, qa_proc_n, qa_proc_n};
17342
17343 ProcessNaNsHelper3(config,
17344 kDRegSize,
17345 za_inputs_4,
17346 zn_inputs_4,
17347 zm_inputs_4,
17348 zd_expected_fmla_4,
17349 zd_expected_fmls_4,
17350 zd_expected_fnmla_4,
17351 zd_expected_fnmls_4,
17352 StrictNaNPropagation);
17353
17354 // Signalling NaNs take precedence over quiet NaNs.
17355 double za_inputs_5[] = {qa, qa, sa, sa, sa};
17356 double zn_inputs_5[] = {qn, sn, sn, sn, qn};
17357 double zm_inputs_5[] = {sm, qm, sm, qa, sm};
17358 uint64_t zd_expected_fmla_5[] = {sm_proc, sn_proc, sa_proc, sa_proc, sa_proc};
17359 uint64_t zd_expected_fmls_5[] = {sm_proc,
17360 sn_proc_n,
17361 sa_proc,
17362 sa_proc,
17363 sa_proc};
17364 uint64_t zd_expected_fnmla_5[] = {sm_proc,
17365 sn_proc_n,
17366 sa_proc_n,
17367 sa_proc_n,
17368 sa_proc_n};
17369 uint64_t zd_expected_fnmls_5[] = {sm_proc,
17370 sn_proc,
17371 sa_proc_n,
17372 sa_proc_n,
17373 sa_proc_n};
17374
17375 ProcessNaNsHelper3(config,
17376 kDRegSize,
17377 za_inputs_5,
17378 zn_inputs_5,
17379 zm_inputs_5,
17380 zd_expected_fmla_5,
17381 zd_expected_fmls_5,
17382 zd_expected_fnmla_5,
17383 zd_expected_fnmls_5,
17384 StrictNaNPropagation);
17385
17386 const double inf = kFP64PositiveInfinity;
17387 const double inf_n = kFP64NegativeInfinity;
17388 uint64_t inf_proc = DoubleToRawbits(inf);
17389 uint64_t inf_proc_n = DoubleToRawbits(inf_n);
17390 uint64_t d_inf_proc = DoubleToRawbits(kFP64DefaultNaN);
17391
17392 double za_inputs_6[] = {qa, qa, 0.0f, -0.0f, qa, sa};
17393 double zn_inputs_6[] = {inf, -0.0f, -0.0f, inf, inf_n, inf};
17394 double zm_inputs_6[] = {0.0f, inf_n, inf, inf, inf, 0.0f};
17395
17396 // quiet_nan + (0.0 * inf) produces the default NaN, not quiet_nan. Ditto for
17397 // (inf * 0.0). On the other hand, quiet_nan + (inf * inf) propagates the
17398 // quiet_nan.
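  // In other words, when one multiplicand is zero and the other is an infinity the
  // multiply stage raises Invalid Operation and generates the default NaN, which is
  // what the expected values below reflect; only when the intermediate product is a
  // valid infinity (inf * inf) does the quiet NaN in `za` survive to the result.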
17399 uint64_t zd_expected_fmla_6[] =
17400 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc, sa_proc};
17401 uint64_t zd_expected_fmls_6[] =
17402 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc, sa_proc};
17403 uint64_t zd_expected_fnmla_6[] =
17404 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc_n, sa_proc_n};
17405 uint64_t zd_expected_fnmls_6[] =
17406 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc_n, sa_proc_n};
17407
17408 ProcessNaNsHelper3(config,
17409 kDRegSize,
17410 za_inputs_6,
17411 zn_inputs_6,
17412 zm_inputs_6,
17413 zd_expected_fmla_6,
17414 zd_expected_fmls_6,
17415 zd_expected_fnmla_6,
17416 zd_expected_fnmls_6,
17417 StrictNaNPropagation);
17418}
17419
17420TEST_SVE(sve_process_nans_float) {
17421 // Use non-standard NaNs to check that the payload bits are preserved.
17422 float sa = RawbitsToFloat(0x7f951111);
17423 float sn = RawbitsToFloat(0x7f952222);
17424 float sm = RawbitsToFloat(0x7f953333);
17425 float qa = RawbitsToFloat(0x7fea1111);
17426 float qn = RawbitsToFloat(0x7fea2222);
17427 float qm = RawbitsToFloat(0x7fea3333);
17428 VIXL_ASSERT(IsSignallingNaN(sa));
17429 VIXL_ASSERT(IsSignallingNaN(sn));
17430 VIXL_ASSERT(IsSignallingNaN(sm));
17431 VIXL_ASSERT(IsQuietNaN(qa));
17432 VIXL_ASSERT(IsQuietNaN(qn));
17433 VIXL_ASSERT(IsQuietNaN(qm));
17434
17435 // The input NaNs after passing through ProcessNaN.
17436 uint32_t sa_proc = 0x7fd51111;
17437 uint32_t sn_proc = 0x7fd52222;
17438 uint32_t sm_proc = 0x7fd53333;
17439 uint32_t qa_proc = FloatToRawbits(qa);
17440 uint32_t qn_proc = FloatToRawbits(qn);
17441 uint32_t qm_proc = FloatToRawbits(qm);
17442 uint32_t sa_proc_n = sa_proc ^ kSSignMask;
17443 uint32_t sn_proc_n = sn_proc ^ kSSignMask;
17444 uint32_t qa_proc_n = qa_proc ^ kSSignMask;
17445 uint32_t qn_proc_n = qn_proc ^ kSSignMask;
17446
17447 // Quiet NaNs are propagated.
17448 float zn_inputs_1[] = {qn, 0.0f, 0.0f, qm, qn, qm};
17449 float zm_inputs_1[] = {0.0f, qn, qm, 0.0f, qm, qn};
17450 uint64_t zd_expected_1[] =
17451 {qn_proc, qn_proc, qm_proc, qm_proc, qn_proc, qm_proc};
17452
17453 ProcessNaNsHelper(config,
17454 kSRegSize,
17455 zn_inputs_1,
17456 zm_inputs_1,
17457 zd_expected_1,
17458 StrictNaNPropagation);
17459
17460 // Signalling NaNs are propagated.
17461 float zn_inputs_2[] = {sn, 0.0f, 0.0f, sm, sn, sm};
17462 float zm_inputs_2[] = {0.0f, sn, sm, 0.0f, sm, sn};
17463 uint64_t zd_expected_2[] =
17464 {sn_proc, sn_proc, sm_proc, sm_proc, sn_proc, sm_proc};
17465 ProcessNaNsHelper(config,
17466 kSRegSize,
17467 zn_inputs_2,
17468 zm_inputs_2,
17469 zd_expected_2,
17470 StrictNaNPropagation);
17471
17472 // Signalling NaNs take precedence over quiet NaNs.
17473 float zn_inputs_3[] = {sn, qn, sn, sn, qn};
17474 float zm_inputs_3[] = {qm, sm, sm, qn, sn};
17475 uint64_t zd_expected_3[] = {sn_proc, sm_proc, sn_proc, sn_proc, sn_proc};
17476 ProcessNaNsHelper(config,
17477 kSRegSize,
17478 zn_inputs_3,
17479 zm_inputs_3,
17480 zd_expected_3,
17481 StrictNaNPropagation);
17482
17483 float za_inputs_4[] = {qa, qa, 0.0f, 0.0f, qa, qa};
17484 float zn_inputs_4[] = {qn, 0.0f, 0.0f, qn, qn, qn};
17485 float zm_inputs_4[] = {0.0f, qm, qm, qm, qm, 0.0f};
17486
17487 // If `a` is propagated, its sign is inverted by fnmla and fnmls.
17488 // If `n` is propagated, its sign is inverted by fmls and fnmla.
17489 // If `m` is propagated, its sign is never inverted.
17490 uint64_t zd_expected_fmla_4[] =
17491 {qa_proc, qa_proc, qm_proc, qn_proc, qa_proc, qa_proc};
17492 uint64_t zd_expected_fmls_4[] =
17493 {qa_proc, qa_proc, qm_proc, qn_proc_n, qa_proc, qa_proc};
17494 uint64_t zd_expected_fnmla_4[] =
17495 {qa_proc_n, qa_proc_n, qm_proc, qn_proc_n, qa_proc_n, qa_proc_n};
17496 uint64_t zd_expected_fnmls_4[] =
17497 {qa_proc_n, qa_proc_n, qm_proc, qn_proc, qa_proc_n, qa_proc_n};
17498
17499 ProcessNaNsHelper3(config,
17500 kSRegSize,
17501 za_inputs_4,
17502 zn_inputs_4,
17503 zm_inputs_4,
17504 zd_expected_fmla_4,
17505 zd_expected_fmls_4,
17506 zd_expected_fnmla_4,
17507 zd_expected_fnmls_4,
17508 StrictNaNPropagation);
17509
17510 // Signalling NaNs take precedence over quiet NaNs.
17511 float za_inputs_5[] = {qa, qa, sa, sa, sa};
17512 float zn_inputs_5[] = {qn, sn, sn, sn, qn};
17513 float zm_inputs_5[] = {sm, qm, sm, qa, sm};
17514 uint64_t zd_expected_fmla_5[] = {sm_proc, sn_proc, sa_proc, sa_proc, sa_proc};
17515 uint64_t zd_expected_fmls_5[] = {sm_proc,
17516 sn_proc_n,
17517 sa_proc,
17518 sa_proc,
17519 sa_proc};
17520 uint64_t zd_expected_fnmla_5[] = {sm_proc,
17521 sn_proc_n,
17522 sa_proc_n,
17523 sa_proc_n,
17524 sa_proc_n};
17525 uint64_t zd_expected_fnmls_5[] = {sm_proc,
17526 sn_proc,
17527 sa_proc_n,
17528 sa_proc_n,
17529 sa_proc_n};
17530
17531 ProcessNaNsHelper3(config,
17532 kSRegSize,
17533 za_inputs_5,
17534 zn_inputs_5,
17535 zm_inputs_5,
17536 zd_expected_fmla_5,
17537 zd_expected_fmls_5,
17538 zd_expected_fnmla_5,
17539 zd_expected_fnmls_5,
17540 StrictNaNPropagation);
17541
17542 const float inf = kFP32PositiveInfinity;
17543 const float inf_n = kFP32NegativeInfinity;
17544 uint32_t inf_proc = FloatToRawbits(inf);
17545 uint32_t inf_proc_n = FloatToRawbits(inf_n);
17546 uint32_t d_inf_proc = FloatToRawbits(kFP32DefaultNaN);
17547
17548 float za_inputs_6[] = {qa, qa, 0.0f, 0.0f, qa, sa};
17549 float zn_inputs_6[] = {inf, 0.0f, 0.0f, inf, inf_n, inf};
17550 float zm_inputs_6[] = {0.0f, inf_n, inf, inf, inf, 0.0f};
17551
17552 // quiet_nan + (0.0 * inf) produces the default NaN, not quiet_nan. Ditto for
17553 // (inf * 0.0). On the other hand, quiet_nan + (inf * inf) propagates the
17554 // quiet_nan.
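  // An (infinity * zero) product is an invalid operation, so when the addend
  // is a quiet NaN the fused multiply-add returns the default NaN rather than
  // propagating it; a signalling NaN addend still takes precedence (see the
  // last lane). A product of two infinities is valid, so the usual propagation
  // rules apply there.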
17555 uint64_t zd_expected_fmla_6[] =
17556 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc, sa_proc};
17557 uint64_t zd_expected_fmls_6[] =
17558 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc, sa_proc};
17559 uint64_t zd_expected_fnmla_6[] =
17560 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc_n, sa_proc_n};
17561 uint64_t zd_expected_fnmls_6[] =
17562 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc_n, sa_proc_n};
17563
17564 ProcessNaNsHelper3(config,
17565 kSRegSize,
17566 za_inputs_6,
17567 zn_inputs_6,
17568 zm_inputs_6,
17569 zd_expected_fmla_6,
17570 zd_expected_fmls_6,
17571 zd_expected_fnmla_6,
17572 zd_expected_fnmls_6,
17573 StrictNaNPropagation);
17574}
17575
17576TEST_SVE(sve_process_nans_half) {
17577 // Use non-standard NaNs to check that the payload bits are preserved.
17578 Float16 sa(RawbitsToFloat16(0x7c11));
17579 Float16 sn(RawbitsToFloat16(0x7c22));
17580 Float16 sm(RawbitsToFloat16(0x7c33));
17581 Float16 qa(RawbitsToFloat16(0x7e44));
17582 Float16 qn(RawbitsToFloat16(0x7e55));
17583 Float16 qm(RawbitsToFloat16(0x7e66));
17584 VIXL_ASSERT(IsSignallingNaN(sa));
17585 VIXL_ASSERT(IsSignallingNaN(sn));
17586 VIXL_ASSERT(IsSignallingNaN(sm));
17587 VIXL_ASSERT(IsQuietNaN(qa));
17588 VIXL_ASSERT(IsQuietNaN(qn));
17589 VIXL_ASSERT(IsQuietNaN(qm));
17590
17591 // The input NaNs after passing through ProcessNaN.
17592 uint16_t sa_proc = 0x7e11;
17593 uint16_t sn_proc = 0x7e22;
17594 uint16_t sm_proc = 0x7e33;
17595 uint16_t qa_proc = Float16ToRawbits(qa);
17596 uint16_t qn_proc = Float16ToRawbits(qn);
17597 uint16_t qm_proc = Float16ToRawbits(qm);
17598 uint16_t sa_proc_n = sa_proc ^ kHSignMask;
17599 uint16_t sn_proc_n = sn_proc ^ kHSignMask;
17600 uint16_t qa_proc_n = qa_proc ^ kHSignMask;
17601 uint16_t qn_proc_n = qn_proc ^ kHSignMask;
17602 Float16 zero(0.0);
17603
17604 // Quiet NaNs are propagated.
17605 Float16 zn_inputs_1[] = {qn, zero, zero, qm, qn, qm};
17606 Float16 zm_inputs_1[] = {zero, qn, qm, zero, qm, qn};
17607 uint64_t zd_expected_1[] =
17608 {qn_proc, qn_proc, qm_proc, qm_proc, qn_proc, qm_proc};
17609
17610 ProcessNaNsHelper(config,
17611 kHRegSize,
17612 zn_inputs_1,
17613 zm_inputs_1,
17614 zd_expected_1,
17615 StrictNaNPropagation);
17616
17617 // Signalling NaNs are propagated.
17618 Float16 zn_inputs_2[] = {sn, zero, zero, sm, sn, sm};
17619 Float16 zm_inputs_2[] = {zero, sn, sm, zero, sm, sn};
17620 uint64_t zd_expected_2[] =
17621 {sn_proc, sn_proc, sm_proc, sm_proc, sn_proc, sm_proc};
17622 ProcessNaNsHelper(config,
17623 kHRegSize,
17624 zn_inputs_2,
17625 zm_inputs_2,
17626 zd_expected_2,
17627 StrictNaNPropagation);
17628
17629 // Signalling NaNs take precedence over quiet NaNs.
17630 Float16 zn_inputs_3[] = {sn, qn, sn, sn, qn};
17631 Float16 zm_inputs_3[] = {qm, sm, sm, qn, sn};
17632 uint64_t zd_expected_3[] = {sn_proc, sm_proc, sn_proc, sn_proc, sn_proc};
17633 ProcessNaNsHelper(config,
17634 kHRegSize,
17635 zn_inputs_3,
17636 zm_inputs_3,
17637 zd_expected_3,
17638 StrictNaNPropagation);
17639
17640 Float16 za_inputs_4[] = {qa, qa, zero, zero, qa, qa};
17641 Float16 zn_inputs_4[] = {qn, zero, zero, qn, qn, qn};
17642 Float16 zm_inputs_4[] = {zero, qm, qm, qm, qm, zero};
17643
17644 // If `a` is propagated, its sign is inverted by fnmla and fnmls.
17645 // If `n` is propagated, its sign is inverted by fmls and fnmla.
17646 // If `m` is propagated, its sign is never inverted.
17647 uint64_t zd_expected_fmla_4[] =
17648 {qa_proc, qa_proc, qm_proc, qn_proc, qa_proc, qa_proc};
17649 uint64_t zd_expected_fmls_4[] =
17650 {qa_proc, qa_proc, qm_proc, qn_proc_n, qa_proc, qa_proc};
17651 uint64_t zd_expected_fnmla_4[] =
17652 {qa_proc_n, qa_proc_n, qm_proc, qn_proc_n, qa_proc_n, qa_proc_n};
17653 uint64_t zd_expected_fnmls_4[] =
17654 {qa_proc_n, qa_proc_n, qm_proc, qn_proc, qa_proc_n, qa_proc_n};
17655
17656 ProcessNaNsHelper3(config,
17657 kHRegSize,
17658 za_inputs_4,
17659 zn_inputs_4,
17660 zm_inputs_4,
17661 zd_expected_fmla_4,
17662 zd_expected_fmls_4,
17663 zd_expected_fnmla_4,
17664 zd_expected_fnmls_4,
17665 StrictNaNPropagation);
17666
17667 // Signalling NaNs take precedence over quiet NaNs.
17668 Float16 za_inputs_5[] = {qa, qa, sa, sa, sa};
17669 Float16 zn_inputs_5[] = {qn, sn, sn, sn, qn};
17670 Float16 zm_inputs_5[] = {sm, qm, sm, qa, sm};
17671 uint64_t zd_expected_fmla_5[] = {sm_proc, sn_proc, sa_proc, sa_proc, sa_proc};
17672 uint64_t zd_expected_fmls_5[] = {sm_proc,
17673 sn_proc_n,
17674 sa_proc,
17675 sa_proc,
17676 sa_proc};
17677 uint64_t zd_expected_fnmla_5[] = {sm_proc,
17678 sn_proc_n,
17679 sa_proc_n,
17680 sa_proc_n,
17681 sa_proc_n};
17682 uint64_t zd_expected_fnmls_5[] = {sm_proc,
17683 sn_proc,
17684 sa_proc_n,
17685 sa_proc_n,
17686 sa_proc_n};
17687
17688 ProcessNaNsHelper3(config,
17689 kHRegSize,
17690 za_inputs_5,
17691 zn_inputs_5,
17692 zm_inputs_5,
17693 zd_expected_fmla_5,
17694 zd_expected_fmls_5,
17695 zd_expected_fnmla_5,
17696 zd_expected_fnmls_5,
17697 StrictNaNPropagation);
17698
17699 const Float16 inf = kFP16PositiveInfinity;
17700 const Float16 inf_n = kFP16NegativeInfinity;
17701 uint64_t inf_proc = Float16ToRawbits(inf);
17702 uint64_t inf_proc_n = Float16ToRawbits(inf_n);
17703 uint64_t d_inf_proc = Float16ToRawbits(kFP16DefaultNaN);
17704
17705 Float16 za_inputs_6[] = {qa, qa, zero, zero, qa, sa};
17706 Float16 zn_inputs_6[] = {inf, zero, zero, inf, inf_n, inf};
17707 Float16 zm_inputs_6[] = {zero, inf_n, inf, inf, inf, zero};
17708
17709 // quiet_nan + (0.0 * inf) produces the default NaN, not quiet_nan. Ditto for
17710 // (inf * 0.0). On the other hand, quiet_nan + (inf * inf) propagates the
17711 // quiet_nan.
17712 uint64_t zd_expected_fmla_6[] =
17713 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc, sa_proc};
17714 uint64_t zd_expected_fmls_6[] =
17715 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc, sa_proc};
17716 uint64_t zd_expected_fnmla_6[] =
17717 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc_n, sa_proc_n};
17718 uint64_t zd_expected_fnmls_6[] =
17719 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc_n, sa_proc_n};
17720
17721 ProcessNaNsHelper3(config,
17722 kHRegSize,
17723 za_inputs_6,
17724 zn_inputs_6,
17725 zm_inputs_6,
17726 zd_expected_fmla_6,
17727 zd_expected_fmls_6,
17728 zd_expected_fnmla_6,
17729 zd_expected_fnmls_6,
17730 StrictNaNPropagation);
17731}
17732
17733typedef void (MacroAssembler::*FCmpFn)(const PRegisterWithLaneSize& pd,
17734 const PRegisterZ& pg,
17735 const ZRegister& zn,
17736 const ZRegister& zm);
17737
17738typedef void (MacroAssembler::*FCmpZeroFn)(const PRegisterWithLaneSize& pd,
17739 const PRegisterZ& pg,
17740                                           const ZRegister& zn,
17741 double zero);
17742
17743typedef void (MacroAssembler::*CmpFn)(const PRegisterWithLaneSize& pd,
17744 const PRegisterZ& pg,
17745 const ZRegister& zn,
17746 const ZRegister& zm);
17747
17748static FCmpFn GetFpAbsCompareFn(Condition cond) {
17749 switch (cond) {
17750 case ge:
17751 return &MacroAssembler::Facge;
17752 case gt:
17753 return &MacroAssembler::Facgt;
17754 case le:
17755 return &MacroAssembler::Facle;
17756 case lt:
17757 return &MacroAssembler::Faclt;
17758 default:
17759 VIXL_UNIMPLEMENTED();
17760 return NULL;
17761 }
17762}
17763
17764static FCmpFn GetFpCompareFn(Condition cond) {
17765 switch (cond) {
17766 case ge:
17767 return &MacroAssembler::Fcmge;
17768 case gt:
17769 return &MacroAssembler::Fcmgt;
17770 case le:
17771 return &MacroAssembler::Fcmle;
17772 case lt:
17773 return &MacroAssembler::Fcmlt;
17774 case eq:
17775 return &MacroAssembler::Fcmeq;
17776 case ne:
17777 return &MacroAssembler::Fcmne;
17778 case uo:
17779 return &MacroAssembler::Fcmuo;
17780 default:
17781 VIXL_UNIMPLEMENTED();
17782 return NULL;
17783 }
17784}
17785
17786static FCmpZeroFn GetFpCompareZeroFn(Condition cond) {
17787 switch (cond) {
17788 case ge:
17789 return &MacroAssembler::Fcmge;
17790 case gt:
17791 return &MacroAssembler::Fcmgt;
17792 case le:
17793 return &MacroAssembler::Fcmle;
17794 case lt:
17795 return &MacroAssembler::Fcmlt;
17796 case eq:
17797 return &MacroAssembler::Fcmeq;
17798 case ne:
17799 return &MacroAssembler::Fcmne;
17800 default:
17801 VIXL_UNIMPLEMENTED();
17802 return NULL;
17803 }
17804}
17805
17806static CmpFn GetIntCompareFn(Condition cond) {
17807 switch (cond) {
17808 case ge:
17809 return &MacroAssembler::Cmpge;
17810 case gt:
17811 return &MacroAssembler::Cmpgt;
17812 case le:
17813 return &MacroAssembler::Cmple;
17814 case lt:
17815 return &MacroAssembler::Cmplt;
17816 case eq:
17817 return &MacroAssembler::Cmpeq;
17818 case ne:
17819 return &MacroAssembler::Cmpne;
17820 default:
17821 VIXL_UNIMPLEMENTED();
17822 return NULL;
17823 }
17824}
17825
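// The selectors above return pointers to MacroAssembler members; the helpers
// below invoke them through the pointer-to-member syntax. For example:
//
//   FCmpFn fcmp = GetFpCompareFn(ge);
//   (masm.*fcmp)(pd, pg.Zeroing(), zn, zm);  // Equivalent to masm.Fcmge(...).
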
17826template <size_t N>
17827static void TestFpCompareHelper(Test* config,
17828 int lane_size_in_bits,
17829 Condition cond,
17830 const double (&zn_inputs)[N],
17831 const double (&zm_inputs)[N],
17832 const int (&pd_expected)[N],
17833 bool is_absolute = false) {
17834 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
17835 START();
17836
17837 ZRegister zt_int_1 = z1.WithLaneSize(lane_size_in_bits);
17838 ZRegister zt_int_2 = z2.WithLaneSize(lane_size_in_bits);
17839 ZRegister zt_int_3 = z3.WithLaneSize(lane_size_in_bits);
17840 ZRegister zt_fp_1 = z11.WithLaneSize(lane_size_in_bits);
17841 ZRegister zt_fp_2 = z12.WithLaneSize(lane_size_in_bits);
17842 ZRegister zt_fp_3 = z13.WithLaneSize(lane_size_in_bits);
17843 ZRegister fp_one = z31.WithLaneSize(lane_size_in_bits);
17844
17845 PRegisterWithLaneSize pd_result_int_1 = p15.WithLaneSize(lane_size_in_bits);
17846 PRegisterWithLaneSize pd_result_fp_1 = p14.WithLaneSize(lane_size_in_bits);
17847 PRegisterWithLaneSize pd_result_int_2 = p13.WithLaneSize(lane_size_in_bits);
17848 PRegisterWithLaneSize pd_result_fp_2 = p12.WithLaneSize(lane_size_in_bits);
17849
17850 FCmpFn fcmp = is_absolute ? GetFpAbsCompareFn(cond) : GetFpCompareFn(cond);
17851 __ Ptrue(p1.VnB());
17852
17853 if (cond != uo) {
17854 int pg_inputs[] = {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1};
17855 Initialise(&masm, p0.WithLaneSize(lane_size_in_bits), pg_inputs);
17856
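    // Build matching integer and floating-point sequences: each FP lane is the
    // corresponding integer lane converted with `scvtf`, plus 0.1. The two
    // sequences therefore order identically, so `cmp<cc>` on the integers
    // predicts the `fcm<cc>` (or synthesized `fac<cc>`) result on the floats.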
17857 __ Fdup(fp_one, 0.1f);
17858
17859 __ Index(zt_int_1, 3, 3);
17860 __ Scvtf(zt_fp_1, p0.Merging(), zt_int_1);
17861 __ Fadd(zt_fp_1, zt_fp_1, fp_one);
17862
17863 __ Index(zt_int_2, 3, -10);
17864 __ Scvtf(zt_fp_2, p0.Merging(), zt_int_2);
17865 __ Fadd(zt_fp_2, zt_fp_2, fp_one);
17866
17867 __ Index(zt_int_3, 3, 2);
17868 __ Scvtf(zt_fp_3, p0.Merging(), zt_int_3);
17869 __ Fadd(zt_fp_3, zt_fp_3, fp_one);
17870
17871
17872    // There is no integer absolute compare, so use `abs` with `cmp<cc>` to
17873    // synthesize the expected result for `fac<cc>`.
17874    if (is_absolute) {
17875 __ Abs(zt_int_2, p1.Merging(), zt_int_2);
17876 }
17877
17878 CmpFn cmp = GetIntCompareFn(cond);
17879 (masm.*cmp)(pd_result_int_1, p0.Zeroing(), zt_int_1, zt_int_2);
17880 (masm.*fcmp)(pd_result_fp_1, p0.Zeroing(), zt_fp_1, zt_fp_2);
17881
17882 (masm.*cmp)(pd_result_int_2, p0.Zeroing(), zt_int_1, zt_int_3);
17883 (masm.*fcmp)(pd_result_fp_2, p0.Zeroing(), zt_fp_1, zt_fp_3);
17884 }
17885
17886 uint64_t zn_inputs_rawbits[N];
17887 uint64_t zm_inputs_rawbits[N];
17888 FPToRawbitsWithSize(zn_inputs, zn_inputs_rawbits, lane_size_in_bits);
17889 FPToRawbitsWithSize(zm_inputs, zm_inputs_rawbits, lane_size_in_bits);
17890
17891 ZRegister zn_fp = z14.WithLaneSize(lane_size_in_bits);
17892 ZRegister zm_fp = z15.WithLaneSize(lane_size_in_bits);
17893 InsrHelper(&masm, zn_fp, zn_inputs_rawbits);
17894 InsrHelper(&masm, zm_fp, zm_inputs_rawbits);
17895
17896 PRegisterWithLaneSize pd_result_fp_3 = p11.WithLaneSize(lane_size_in_bits);
17897 (masm.*fcmp)(pd_result_fp_3, p1.Zeroing(), zn_fp, zm_fp);
17898
17899 END();
17900
17901 if (CAN_RUN()) {
17902 RUN();
17903
17904 if (cond != uo) {
17905 ASSERT_EQUAL_SVE(pd_result_int_1, pd_result_fp_1);
17906 ASSERT_EQUAL_SVE(pd_result_int_2, pd_result_fp_2);
17907 }
17908 ASSERT_EQUAL_SVE(pd_expected, pd_result_fp_3);
17909 }
17910}
17911
17912TEST_SVE(sve_fp_compare_vectors) {
17913 double inf_p = kFP64PositiveInfinity;
17914 double inf_n = kFP64NegativeInfinity;
17915 double nan = kFP64DefaultNaN;
17916
17917 // Normal floating point comparison has been tested in the helper.
17918 double zn[] = {0.0, inf_n, 1.0, inf_p, inf_p, nan, 0.0, nan};
17919 double zm[] = {-0.0, inf_n, inf_n, -2.0, inf_n, nan, nan, inf_p};
17920
17921 int pd_fcm_gt[] = {0, 0, 1, 1, 1, 0, 0, 0};
17922 int pd_fcm_lt[] = {0, 0, 0, 0, 0, 0, 0, 0};
17923 int pd_fcm_ge[] = {1, 1, 1, 1, 1, 0, 0, 0};
17924 int pd_fcm_le[] = {1, 1, 0, 0, 0, 0, 0, 0};
17925 int pd_fcm_eq[] = {1, 1, 0, 0, 0, 0, 0, 0};
17926  int pd_fcm_ne[] = {0, 0, 1, 1, 1, 1, 1, 1};
17927  int pd_fcm_uo[] = {0, 0, 0, 0, 0, 1, 1, 1};
17928 int pd_fac_gt[] = {0, 0, 0, 1, 0, 0, 0, 0};
17929 int pd_fac_lt[] = {0, 0, 1, 0, 0, 0, 0, 0};
17930 int pd_fac_ge[] = {1, 1, 0, 1, 1, 0, 0, 0};
17931 int pd_fac_le[] = {1, 1, 1, 0, 1, 0, 0, 0};
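  // Lanes with a NaN operand are unordered: gt, ge, lt, le and eq are all
  // false for them, while ne and uo are true. The absolute compares (fac<cc>)
  // treat unordered lanes the same way.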
17932
17933 int lane_sizes[] = {kHRegSize, kSRegSize, kDRegSize};
17934
17935 for (size_t i = 0; i < ArrayLength(lane_sizes); i++) {
17936 int lane_size = lane_sizes[i];
17937 // Test floating-point compare vectors.
17938 TestFpCompareHelper(config, lane_size, gt, zn, zm, pd_fcm_gt);
17939 TestFpCompareHelper(config, lane_size, lt, zn, zm, pd_fcm_lt);
17940 TestFpCompareHelper(config, lane_size, ge, zn, zm, pd_fcm_ge);
17941 TestFpCompareHelper(config, lane_size, le, zn, zm, pd_fcm_le);
17942 TestFpCompareHelper(config, lane_size, eq, zn, zm, pd_fcm_eq);
17943 TestFpCompareHelper(config, lane_size, ne, zn, zm, pd_fcm_ne);
17944 TestFpCompareHelper(config, lane_size, uo, zn, zm, pd_fcm_uo);
17945
17946 // Test floating-point absolute compare vectors.
17947 TestFpCompareHelper(config, lane_size, gt, zn, zm, pd_fac_gt, true);
17948 TestFpCompareHelper(config, lane_size, lt, zn, zm, pd_fac_lt, true);
17949 TestFpCompareHelper(config, lane_size, ge, zn, zm, pd_fac_ge, true);
17950 TestFpCompareHelper(config, lane_size, le, zn, zm, pd_fac_le, true);
17951 }
17952}
17953
17954template <size_t N, typename T>
17955static void TestFpCompareZeroHelper(Test* config,
17956 int lane_size_in_bits,
17957 Condition cond,
17958 const T (&zn_inputs)[N],
17959 const int (&pd_expected)[N]) {
17960 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
17961 START();
17962
17963 ZRegister zn = z28.WithLaneSize(lane_size_in_bits);
17964 PRegisterWithLaneSize pd = p14.WithLaneSize(lane_size_in_bits);
17965
17966 uint64_t zn_rawbits[N];
17967 FPToRawbitsWithSize(zn_inputs, zn_rawbits, lane_size_in_bits);
17968 InsrHelper(&masm, zn, zn_rawbits);
17969
17970 __ Ptrue(p0.VnB());
17971  (masm.*GetFpCompareZeroFn(cond))(pd, p0.Zeroing(), zn, 0.0);
17972
17973 END();
17974
17975 if (CAN_RUN()) {
17976 RUN();
17977
17978 ASSERT_EQUAL_SVE(pd_expected, pd);
17979 }
17980}
17981
17982TEST_SVE(sve_fp_compare_vector_zero) {
17983 Float16 fp16_inf_p = kFP16PositiveInfinity;
17984 Float16 fp16_inf_n = kFP16NegativeInfinity;
17985 Float16 fp16_dn = kFP16DefaultNaN;
17986 Float16 fp16_sn = RawbitsToFloat16(0x7c22);
17987 Float16 fp16_qn = RawbitsToFloat16(0x7e55);
17988
17989 float fp32_inf_p = kFP32PositiveInfinity;
17990 float fp32_inf_n = kFP32NegativeInfinity;
17991 float fp32_dn = kFP32DefaultNaN;
17992 float fp32_sn = RawbitsToFloat(0x7f952222);
17993 float fp32_qn = RawbitsToFloat(0x7fea2222);
17994
17995 double fp64_inf_p = kFP64PositiveInfinity;
17996 double fp64_inf_n = kFP64NegativeInfinity;
17997 double fp64_dn = kFP64DefaultNaN;
17998 double fp64_sn = RawbitsToDouble(0x7ff5555511111111);
17999 double fp64_qn = RawbitsToDouble(0x7ffaaaaa11111111);
18000
18001 // Normal floating point comparison has been tested in the non-zero form.
18002 Float16 zn_inputs_h[] = {Float16(0.0),
18003 Float16(-0.0),
18004 fp16_inf_p,
18005 fp16_inf_n,
18006 fp16_dn,
18007 fp16_sn,
18008 fp16_qn};
18009 float zn_inputs_s[] =
18010 {0.0, -0.0, fp32_inf_p, fp32_inf_n, fp32_dn, fp32_sn, fp32_qn};
18011 double zn_inputs_d[] =
18012 {0.0, -0.0, fp64_inf_p, fp64_inf_n, fp64_dn, fp64_sn, fp64_qn};
18013
18014 int pd_expected_gt[] = {0, 0, 1, 0, 0, 0, 0};
18015 int pd_expected_lt[] = {0, 0, 0, 1, 0, 0, 0};
18016 int pd_expected_ge[] = {1, 1, 1, 0, 0, 0, 0};
18017 int pd_expected_le[] = {1, 1, 0, 1, 0, 0, 0};
18018 int pd_expected_eq[] = {1, 1, 0, 0, 0, 0, 0};
18019  int pd_expected_ne[] = {0, 0, 1, 1, 1, 1, 1};
18020
18021 TestFpCompareZeroHelper(config, kDRegSize, gt, zn_inputs_d, pd_expected_gt);
18022 TestFpCompareZeroHelper(config, kDRegSize, lt, zn_inputs_d, pd_expected_lt);
18023 TestFpCompareZeroHelper(config, kDRegSize, ge, zn_inputs_d, pd_expected_ge);
18024 TestFpCompareZeroHelper(config, kDRegSize, le, zn_inputs_d, pd_expected_le);
18025 TestFpCompareZeroHelper(config, kDRegSize, eq, zn_inputs_d, pd_expected_eq);
18026 TestFpCompareZeroHelper(config, kDRegSize, ne, zn_inputs_d, pd_expected_ne);
18027
18028 TestFpCompareZeroHelper(config, kSRegSize, gt, zn_inputs_s, pd_expected_gt);
18029 TestFpCompareZeroHelper(config, kSRegSize, lt, zn_inputs_s, pd_expected_lt);
18030 TestFpCompareZeroHelper(config, kSRegSize, ge, zn_inputs_s, pd_expected_ge);
18031 TestFpCompareZeroHelper(config, kSRegSize, le, zn_inputs_s, pd_expected_le);
18032 TestFpCompareZeroHelper(config, kSRegSize, eq, zn_inputs_s, pd_expected_eq);
18033 TestFpCompareZeroHelper(config, kSRegSize, ne, zn_inputs_s, pd_expected_ne);
18034
18035 TestFpCompareZeroHelper(config, kHRegSize, gt, zn_inputs_h, pd_expected_gt);
18036 TestFpCompareZeroHelper(config, kHRegSize, lt, zn_inputs_h, pd_expected_lt);
18037 TestFpCompareZeroHelper(config, kHRegSize, ge, zn_inputs_h, pd_expected_ge);
18038 TestFpCompareZeroHelper(config, kHRegSize, le, zn_inputs_h, pd_expected_le);
18039 TestFpCompareZeroHelper(config, kHRegSize, eq, zn_inputs_h, pd_expected_eq);
18040 TestFpCompareZeroHelper(config, kHRegSize, ne, zn_inputs_h, pd_expected_ne);
18041}
18042
18043typedef void (MacroAssembler::*FPUnaryMFn)(const ZRegister& zd,
18044 const PRegisterM& pg,
18045 const ZRegister& zn);
18046
18047typedef void (MacroAssembler::*FPUnaryZFn)(const ZRegister& zd,
18048 const PRegisterZ& pg,
18049 const ZRegister& zn);
18050
18051template <size_t N, size_t M>
18052static void TestFPUnaryPredicatedHelper(Test* config,
18053 int src_size_in_bits,
18054 int dst_size_in_bits,
18055 uint64_t (&zn_inputs)[N],
18056 const uint64_t (&pg_inputs)[M],
18057 const uint64_t (&zd_expected)[N],
18058 FPUnaryMFn macro_m,
18059 FPUnaryZFn macro_z) {
18060 // Provide the full predicate input.
18061 VIXL_ASSERT(M == (kPRegMaxSize / kDRegSize));
18062 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
18063 START();
18064
18065 int ds = dst_size_in_bits;
18066 int ss = src_size_in_bits;
18067 int ls = std::max(ss, ds);
18068
18069  // When the destination type is larger than the source type, fill the high
18070  // parts with noise values, which should be ignored.
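  // The noise is derived from a per-lane signalling NaN pattern, so a lane
  // that wrongly consumes its high part is easy to spot in the results.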
18071 if (ds > ss) {
18072 VIXL_ASSERT(ss < 64);
18073 uint64_t zn_inputs_mod[N];
18074 uint64_t sn = GetSignallingNan(ss);
18075 for (unsigned i = 0; i < N; i++) {
18076 zn_inputs_mod[i] = zn_inputs[i] | ((sn + i) << ss);
18077 }
18078 InsrHelper(&masm, z29.WithLaneSize(ls), zn_inputs_mod);
18079 } else {
18080 InsrHelper(&masm, z29.WithLaneSize(ls), zn_inputs);
18081 }
18082
18083 // Make a copy so we can check that constructive operations preserve zn.
18084 __ Mov(z28, z29);
18085
18086 // Run the operation on all lanes.
18087 __ Ptrue(p0.WithLaneSize(ls));
18088 (masm.*macro_m)(z27.WithLaneSize(ds), p0.Merging(), z28.WithLaneSize(ss));
18089
18090 Initialise(&masm,
18091 p1.VnB(),
18092 pg_inputs[3],
18093 pg_inputs[2],
18094 pg_inputs[1],
18095 pg_inputs[0]);
18096
18097 // Clear the irrelevant lanes.
18098 __ Index(z31.WithLaneSize(ls), 0, 1);
18099 __ Cmplt(p1.WithLaneSize(ls), p1.Zeroing(), z31.WithLaneSize(ls), N);
18100
18101 // Check merging predication.
18102 __ Index(z11.WithLaneSize(ls), 42, 1);
18103 // Preserve the base value so we can derive the expected result.
18104 __ Mov(z21, z11);
18105 __ Mov(z9, z11);
18106 (masm.*macro_m)(z11.WithLaneSize(ds), p1.Merging(), z28.WithLaneSize(ss));
18107
18108 // Generate expected values using explicit merging operations.
18109 InsrHelper(&masm, z25.WithLaneSize(ls), zd_expected);
18110 __ Mov(z21.WithLaneSize(ls), p1.Merging(), z25.WithLaneSize(ls));
18111
18112 // Check zeroing predication.
18113 __ Index(z12.WithLaneSize(ds), 42, -1);
18114 (masm.*macro_z)(z12.WithLaneSize(ds), p1.Zeroing(), z28.WithLaneSize(ss));
18115
18116 // Generate expected values using explicit zeroing operations.
18117 InsrHelper(&masm, z30.WithLaneSize(ls), zd_expected);
18118 // Emulate zeroing predication.
18119 __ Dup(z22.WithLaneSize(ls), 0);
18120 __ Mov(z22.WithLaneSize(ls), p1.Merging(), z30.WithLaneSize(ls));
18121
18122 // Check an in-place update.
18123 __ Mov(z9.WithLaneSize(ls), p1.Merging(), z28.WithLaneSize(ls));
18124 (masm.*macro_m)(z9.WithLaneSize(ds), p1.Merging(), z9.WithLaneSize(ss));
18125
18126 END();
18127
18128 if (CAN_RUN()) {
18129 RUN();
18130
18131 // Check all lanes.
18132 ASSERT_EQUAL_SVE(zd_expected, z27.WithLaneSize(ls));
18133
18134 // Check that constructive operations preserve their inputs.
18135 ASSERT_EQUAL_SVE(z28, z29);
18136
18137 // Check merging predication.
18138    ASSERT_EQUAL_SVE(z21.WithLaneSize(ls), z11.WithLaneSize(ls));
18139
18140 // Check zeroing predication.
18141 ASSERT_EQUAL_SVE(z22.WithLaneSize(ls), z12.WithLaneSize(ls));
18142
18143 // Check in-place operation where zd == zn.
18144 ASSERT_EQUAL_SVE(z21.WithLaneSize(ls), z9.WithLaneSize(ls));
18145 }
18146}
18147
18148template <size_t N, typename T>
18149static void TestFPUnaryPredicatedHelper(Test* config,
18150 int src_size_in_bits,
18151 int dst_size_in_bits,
18152 T (&zn_inputs)[N],
18153 const T (&zd_expected)[N],
18154 FPUnaryMFn macro_m,
18155 FPUnaryZFn macro_z) {
18156 uint64_t pg_inputs[] = {0xa55aa55aa55aa55a,
18157 0xa55aa55aa55aa55a,
18158 0xa55aa55aa55aa55a,
18159 0xa55aa55aa55aa55a};
18160
18161 TestFPUnaryPredicatedHelper(config,
18162 src_size_in_bits,
18163 dst_size_in_bits,
18164 zn_inputs,
18165 pg_inputs,
18166 zd_expected,
18167 macro_m,
18168 macro_z);
18169
18170  // The complement of the above predicate, to get full input coverage.
18171 uint64_t pg_c_inputs[] = {0x5aa55aa55aa55aa5,
18172 0x5aa55aa55aa55aa5,
18173 0x5aa55aa55aa55aa5,
18174 0x5aa55aa55aa55aa5};
18175
18176 TestFPUnaryPredicatedHelper(config,
18177 src_size_in_bits,
18178 dst_size_in_bits,
18179 zn_inputs,
18180 pg_c_inputs,
18181 zd_expected,
18182 macro_m,
18183 macro_z);
18184}
18185
18186template <size_t N, typename T>
18187static void TestFcvtHelper(Test* config,
18188 int src_size_in_bits,
18189 int dst_size_in_bits,
18190 T (&zn_inputs)[N],
18191 const T (&zd_expected)[N]) {
18192 TestFPUnaryPredicatedHelper(config,
18193 src_size_in_bits,
18194 dst_size_in_bits,
18195 zn_inputs,
18196 zd_expected,
18197 &MacroAssembler::Fcvt, // Merging form.
18198                              &MacroAssembler::Fcvt);   // Zeroing form.
18199}
18200
18201TEST_SVE(sve_fcvt) {
18202 uint64_t h_vals[] = {0x7c00,
18203 0xfc00,
18204 0,
18205 0x8000,
18206 0x7bff, // Max half precision.
18207 0x0400, // Min positive normal.
18208 0x03ff, // Max subnormal.
18209 0x0001}; // Min positive subnormal.
18210
18211 uint64_t s_vals[] = {0x7f800000,
18212 0xff800000,
18213 0,
18214 0x80000000,
18215 0x477fe000,
18216 0x38800000,
18217 0x387fc000,
18218 0x33800000};
18219
18220 uint64_t d_vals[] = {0x7ff0000000000000,
18221 0xfff0000000000000,
18222 0,
18223 0x8000000000000000,
18224 0x40effc0000000000,
18225 0x3f10000000000000,
18226 0x3f0ff80000000000,
18227 0x3e70000000000000};
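  // h_vals, s_vals and d_vals hold the same numerical values in half, single
  // and double precision, so each conversion below should map one array
  // exactly onto another.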
18228
18229 TestFcvtHelper(config, kHRegSize, kSRegSize, h_vals, s_vals);
18230 TestFcvtHelper(config, kSRegSize, kHRegSize, s_vals, h_vals);
18231 TestFcvtHelper(config, kSRegSize, kDRegSize, s_vals, d_vals);
18232 TestFcvtHelper(config, kDRegSize, kSRegSize, d_vals, s_vals);
18233 TestFcvtHelper(config, kHRegSize, kDRegSize, h_vals, d_vals);
18234 TestFcvtHelper(config, kDRegSize, kHRegSize, d_vals, h_vals);
18235}
18236
18237TEST_SVE(sve_fcvt_nan) {
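  // When a NaN is converted, the result is quietened (the top fraction bit is
  // set) and the source payload is moved into the most-significant fraction
  // bits of the destination, so widening preserves the payload and narrowing
  // truncates it.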
18238 uint64_t h_inputs[] = {0x7e55, // Quiet NaN.
18239 0x7c22}; // Signalling NaN.
18240
18241 uint64_t h2s_expected[] = {0x7fcaa000, 0x7fc44000};
18242
18243 uint64_t h2d_expected[] = {0x7ff9540000000000, 0x7ff8880000000000};
18244
18245 uint64_t s_inputs[] = {0x7fc12345, // Quiet NaN.
18246 0x7f812345}; // Signalling NaN.
18247
18248 uint64_t s2h_expected[] = {0x7e09, 0x7e09};
18249
18250 uint64_t s2d_expected[] = {0x7ff82468a0000000, 0x7ff82468a0000000};
18251
18252 uint64_t d_inputs[] = {0x7ffaaaaa22222222, // Quiet NaN.
18253 0x7ff5555511111111}; // Signalling NaN.
18254
18255 uint64_t d2h_expected[] = {0x7eaa, 0x7f55};
18256
18257 uint64_t d2s_expected[] = {0x7fd55551, 0x7feaaaa8};
18258
18259 TestFcvtHelper(config, kHRegSize, kSRegSize, h_inputs, h2s_expected);
18260 TestFcvtHelper(config, kSRegSize, kHRegSize, s_inputs, s2h_expected);
18261 TestFcvtHelper(config, kHRegSize, kDRegSize, h_inputs, h2d_expected);
18262 TestFcvtHelper(config, kDRegSize, kHRegSize, d_inputs, d2h_expected);
18263 TestFcvtHelper(config, kSRegSize, kDRegSize, s_inputs, s2d_expected);
18264 TestFcvtHelper(config, kDRegSize, kSRegSize, d_inputs, d2s_expected);
18265}
18266
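// Frecpx produces a coarse reciprocal-exponent estimate: the sign is kept, the
// fraction is cleared and the exponent is bitwise inverted, with zero and
// subnormal inputs (stored exponent zero) mapping to the largest normal
// exponent. NaNs are quietened and propagated as usual.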
18267template <size_t N, typename T>
18268static void TestFrecpxHelper(Test* config,
18269 int lane_size_in_bits,
18270 T (&zn_inputs)[N],
18271 const T (&zd_expected)[N]) {
18272 TestFPUnaryPredicatedHelper(config,
18273 lane_size_in_bits,
18274 lane_size_in_bits,
18275 zn_inputs,
18276 zd_expected,
18277 &MacroAssembler::Frecpx, // Merging form.
18278                              &MacroAssembler::Frecpx);  // Zeroing form.
18279}
18280
18281TEST_SVE(sve_frecpx_h) {
18282 uint64_t zn_inputs[] = {Float16ToRawbits(kFP16PositiveInfinity),
18283 Float16ToRawbits(kFP16NegativeInfinity),
18284 Float16ToRawbits(Float16(0.0)),
18285 Float16ToRawbits(Float16(-0.0)),
18286 0x0001, // Smallest positive subnormal number.
18287 0x03ff, // Largest subnormal number.
18288 0x0400, // Smallest positive normal number.
18289 0x7bff, // Largest normal number.
18290 0x3bff, // Largest number less than one.
18291 0x3c01, // Smallest number larger than one.
18292 0x7c22, // Signalling NaN.
18293 0x7e55}; // Quiet NaN.
18294
18295 uint64_t zd_expected[] = {0,
18296 0x8000,
18297 0x7800,
18298 0xf800,
18299                            // The exponent of subnormal numbers is zero.
18300 0x7800,
18301 0x7800,
18302 0x7800,
18303 0x0400,
18304 0x4400,
18305 0x4000,
18306 0x7e22, // To quiet NaN.
18307 0x7e55};
18308
18309 TestFrecpxHelper(config, kHRegSize, zn_inputs, zd_expected);
18310}
18311
18312TEST_SVE(sve_frecpx_s) {
18313 uint64_t zn_inputs[] = {FloatToRawbits(kFP32PositiveInfinity),
18314 FloatToRawbits(kFP32NegativeInfinity),
18315 FloatToRawbits(65504), // Max half precision.
18316 FloatToRawbits(6.10352e-5), // Min positive normal.
18317 FloatToRawbits(6.09756e-5), // Max subnormal.
18318 FloatToRawbits(
18319 5.96046e-8), // Min positive subnormal.
18320 FloatToRawbits(5e-9), // Not representable -> zero.
18321 FloatToRawbits(-0.0),
18322 FloatToRawbits(0.0),
18323 0x7f952222, // Signalling NaN.
18324                          0x7fea2222};  // Quiet NaN.
18325
18326 uint64_t zd_expected[] = {0, // 0.0
18327 0x80000000, // -0.0
18328 0x38800000, // 6.10352e-05
18329 0x47000000, // 32768
18330 0x47800000, // 65536
18331 0x4c800000, // 6.71089e+07
18332 0x4e000000, // 5.36871e+08
18333 0xff000000, // -1.70141e+38
18334 0x7f000000, // 1.70141e+38
18335 0x7fd52222,
18336 0x7fea2222};
18337
18338 TestFrecpxHelper(config, kSRegSize, zn_inputs, zd_expected);
18339}
18340
18341TEST_SVE(sve_frecpx_d) {
18342 uint64_t zn_inputs[] = {DoubleToRawbits(kFP64PositiveInfinity),
18343 DoubleToRawbits(kFP64NegativeInfinity),
18344 DoubleToRawbits(65504), // Max half precision.
18345 DoubleToRawbits(6.10352e-5), // Min positive normal.
18346 DoubleToRawbits(6.09756e-5), // Max subnormal.
18347 DoubleToRawbits(
18348 5.96046e-8), // Min positive subnormal.
18349 DoubleToRawbits(5e-9), // Not representable -> zero.
18350 DoubleToRawbits(-0.0),
18351 DoubleToRawbits(0.0),
18352 0x7ff5555511111111, // Signalling NaN.
18353                          0x7ffaaaaa11111111};  // Quiet NaN.
18354
18355 uint64_t zd_expected[] = {0, // 0.0
18356 0x8000000000000000, // -0.0
18357 0x3f10000000000000, // 6.10352e-05
18358 0x40e0000000000000, // 32768
18359 0x40f0000000000000, // 65536
18360 0x4190000000000000, // 6.71089e+07
18361 0x41c0000000000000, // 5.36871e+08
18362 0xffe0000000000000, // -1.70141e+38
18363 0x7fe0000000000000, // 1.70141e+38
18364 0x7ffd555511111111,
18365 0x7ffaaaaa11111111};
18366
18367 TestFrecpxHelper(config, kDRegSize, zn_inputs, zd_expected);
18368}
18369
18370template <size_t N, typename T>
18371static void TestFsqrtHelper(Test* config,
18372 int lane_size_in_bits,
18373 T (&zn_inputs)[N],
18374 const T (&zd_expected)[N]) {
18375 TestFPUnaryPredicatedHelper(config,
18376 lane_size_in_bits,
18377 lane_size_in_bits,
18378 zn_inputs,
18379 zd_expected,
18380 &MacroAssembler::Fsqrt, // Merging form.
18381                              &MacroAssembler::Fsqrt);   // Zeroing form.
18382}
18383
18384TEST_SVE(sve_fsqrt_h) {
18385 uint64_t zn_inputs[] =
18386 {Float16ToRawbits(Float16(0.0)),
18387 Float16ToRawbits(Float16(-0.0)),
18388 Float16ToRawbits(Float16(1.0)),
18389 Float16ToRawbits(Float16(65025.0)),
18390 Float16ToRawbits(kFP16PositiveInfinity),
18391 Float16ToRawbits(kFP16NegativeInfinity),
18392 Float16ToRawbits(Float16(6.10352e-5)), // Min normal positive.
18393 Float16ToRawbits(Float16(65504.0)), // Max normal positive float.
18394 Float16ToRawbits(Float16(6.09756e-5)), // Max subnormal.
18395 Float16ToRawbits(Float16(5.96046e-8)), // Min subnormal positive.
18396       0x7c22,   // Signalling NaN.
18397       0x7e55};  // Quiet NaN.
18398
18399 uint64_t zd_expected[] = {Float16ToRawbits(Float16(0.0)),
18400 Float16ToRawbits(Float16(-0.0)),
18401 Float16ToRawbits(Float16(1.0)),
18402 Float16ToRawbits(Float16(255.0)),
18403 Float16ToRawbits(kFP16PositiveInfinity),
18404 Float16ToRawbits(kFP16DefaultNaN),
18405 0x2000,
18406 0x5bff,
18407 0x1fff,
18408 0x0c00,
18409 0x7e22, // To quiet NaN.
18410 0x7e55};
18411
18412 TestFsqrtHelper(config, kHRegSize, zn_inputs, zd_expected);
18413}
18414
18415TEST_SVE(sve_fsqrt_s) {
18416 uint64_t zn_inputs[] = {FloatToRawbits(0.0f),
18417 FloatToRawbits(-0.0f),
18418 FloatToRawbits(1.0f),
18419 FloatToRawbits(65536.0f),
18420 FloatToRawbits(kFP32PositiveInfinity),
18421 FloatToRawbits(kFP32NegativeInfinity),
18422 0x00800000, // Min normal positive, ~1.17e−38
18423 0x7f7fffff, // Max normal positive, ~3.40e+38
18424 0x00000001, // Min subnormal positive, ~1.40e−45
18425 0x007fffff, // Max subnormal, ~1.17e−38
18426                          0x7f951111,   // Signalling NaN.
18427                          0x7fea1111};  // Quiet NaN.
18428
18429 uint64_t zd_expected[] = {FloatToRawbits(0.0f),
18430 FloatToRawbits(-0.0f),
18431 FloatToRawbits(1.0f),
18432 FloatToRawbits(256.0f),
18433 FloatToRawbits(kFP32PositiveInfinity),
18434 FloatToRawbits(kFP32DefaultNaN),
18435 0x20000000, // ~1.08e-19
18436 0x5f7fffff, // ~1.84e+19
18437 0x1a3504f3, // ~3.74e-23
18438 0x1fffffff, // ~1.08e-19
18439 0x7fd51111, // To quiet NaN.
18440 0x7fea1111};
18441
18442 TestFsqrtHelper(config, kSRegSize, zn_inputs, zd_expected);
18443}
18444
18445TEST_SVE(sve_fsqrt_d) {
18446 uint64_t zn_inputs[] =
18447 {DoubleToRawbits(0.0),
18448 DoubleToRawbits(-0.0),
18449 DoubleToRawbits(1.0),
18450 DoubleToRawbits(65536.0),
18451 DoubleToRawbits(kFP64PositiveInfinity),
18452 DoubleToRawbits(kFP64NegativeInfinity),
18453 0x0010000000000000, // Min normal positive, ~2.22e-308
18454 0x7fefffffffffffff, // Max normal positive, ~1.79e+308
18455 0x0000000000000001, // Min subnormal positive, 5e-324
18456 0x000fffffffffffff, // Max subnormal, ~2.22e-308
18457 0x7ff5555511111111,
18458 0x7ffaaaaa11111111};
18459
18460 uint64_t zd_expected[] = {DoubleToRawbits(0.0),
18461 DoubleToRawbits(-0.0),
18462 DoubleToRawbits(1.0),
18463 DoubleToRawbits(256.0),
18464 DoubleToRawbits(kFP64PositiveInfinity),
18465 DoubleToRawbits(kFP64DefaultNaN),
18466 0x2000000000000000, // ~1.49e-154
18467 0x5fefffffffffffff, // ~1.34e+154
18468 0x1e60000000000000, // ~2.22e-162
18469 0x1fffffffffffffff, // ~1.49e-154
18470 0x7ffd555511111111, // To quiet NaN.
18471 0x7ffaaaaa11111111};
18472
18473 TestFsqrtHelper(config, kDRegSize, zn_inputs, zd_expected);
18474}
18475
18476TEST_SVE(sve_adr) {
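  // Adr computes a vector of addresses: each lane of the result is the base
  // lane plus the offset lane, optionally sign- or zero-extended and shifted
  // left by up to three. For example, with `LSL, 2` below, lane 0 of z5 is
  // 0x10000000f0000000 + (1 << 2).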
18477 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
18478 START();
18479
18480 __ Index(z0.VnD(), 0x10000000f0000000, 0x1000);
18481 __ Index(z1.VnD(), 1, 3);
18482 __ Index(z2.VnS(), -1, -1);
18483 __ Adr(z3.VnD(), SVEMemOperand(z0.VnD(), z1.VnD()));
18484 __ Adr(z4.VnD(), SVEMemOperand(z0.VnD(), z1.VnD(), LSL, 1));
18485 __ Adr(z5.VnD(), SVEMemOperand(z0.VnD(), z1.VnD(), LSL, 2));
18486 __ Adr(z6.VnD(), SVEMemOperand(z0.VnD(), z1.VnD(), LSL, 3));
18487 __ Adr(z7.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), UXTW));
18488 __ Adr(z8.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), UXTW, 1));
18489 __ Adr(z9.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), UXTW, 2));
18490 __ Adr(z10.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), UXTW, 3));
18491 __ Adr(z11.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), SXTW));
18492 __ Adr(z12.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), SXTW, 1));
18493 __ Adr(z13.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), SXTW, 2));
18494 __ Adr(z14.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), SXTW, 3));
18495 __ Adr(z15.VnS(), SVEMemOperand(z0.VnS(), z2.VnS()));
18496 __ Adr(z16.VnS(), SVEMemOperand(z0.VnS(), z2.VnS(), LSL, 1));
18497 __ Adr(z17.VnS(), SVEMemOperand(z0.VnS(), z2.VnS(), LSL, 2));
18498 __ Adr(z18.VnS(), SVEMemOperand(z0.VnS(), z2.VnS(), LSL, 3));
18499
18500 END();
18501
18502 if (CAN_RUN()) {
18503 RUN();
18504 uint64_t expected_z3[] = {0x10000000f0001004, 0x10000000f0000001};
18505 uint64_t expected_z4[] = {0x10000000f0001008, 0x10000000f0000002};
18506 uint64_t expected_z5[] = {0x10000000f0001010, 0x10000000f0000004};
18507 uint64_t expected_z6[] = {0x10000000f0001020, 0x10000000f0000008};
18508 uint64_t expected_z7[] = {0x10000001f0000ffd, 0x10000001efffffff};
18509 uint64_t expected_z8[] = {0x10000002f0000ffa, 0x10000002effffffe};
18510 uint64_t expected_z9[] = {0x10000004f0000ff4, 0x10000004effffffc};
18511 uint64_t expected_z10[] = {0x10000008f0000fe8, 0x10000008effffff8};
18512 uint64_t expected_z11[] = {0x10000000f0000ffd, 0x10000000efffffff};
18513 uint64_t expected_z12[] = {0x10000000f0000ffa, 0x10000000effffffe};
18514 uint64_t expected_z13[] = {0x10000000f0000ff4, 0x10000000effffffc};
18515 uint64_t expected_z14[] = {0x10000000f0000fe8, 0x10000000effffff8};
18516 uint64_t expected_z15[] = {0x0ffffffcf0000ffd, 0x0ffffffeefffffff};
18517 uint64_t expected_z16[] = {0x0ffffff8f0000ffa, 0x0ffffffceffffffe};
18518 uint64_t expected_z17[] = {0x0ffffff0f0000ff4, 0x0ffffff8effffffc};
18519 uint64_t expected_z18[] = {0x0fffffe0f0000fe8, 0x0ffffff0effffff8};
18520
18521 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
18522 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
18523 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
18524 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
18525 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
18526 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
18527 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
18528 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
18529 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
18530 ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
18531 ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
18532 ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
18533 ASSERT_EQUAL_SVE(expected_z15, z15.VnD());
18534 ASSERT_EQUAL_SVE(expected_z16, z16.VnD());
18535 ASSERT_EQUAL_SVE(expected_z17, z17.VnD());
18536 ASSERT_EQUAL_SVE(expected_z18, z18.VnD());
18537 }
18538}
18539
18540// Test load-and-broadcast instructions by comparing their results with those
18541// of equivalent scalar loads.
18542template <typename F>
18543static void LoadBcastHelper(Test* config,
18544 unsigned msize_in_bits,
18545 unsigned esize_in_bits,
18546 F sve_ld1,
18547 bool is_signed) {
18548 VIXL_ASSERT((esize_in_bits == kBRegSize) || (esize_in_bits == kHRegSize) ||
18549 (esize_in_bits == kSRegSize) || (esize_in_bits == kDRegSize));
18550 static const unsigned kMaxLaneCount = kZRegMaxSize / kBRegSize;
18551
18552 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
18553 START();
18554
18555 unsigned msize_in_bytes = msize_in_bits / kBitsPerByte;
18556 unsigned esize_in_bytes = esize_in_bits / kBitsPerByte;
18557 int vl = config->sve_vl_in_bytes();
18558
18559 uint64_t offsets[kMaxLaneCount];
18560 uint64_t buffer_size = vl * 64;
18561 uint64_t data = reinterpret_cast<uintptr_t>(malloc(buffer_size));
18562 BufferFillingHelper(data,
18563 buffer_size,
18564 msize_in_bytes,
18565 kMaxLaneCount,
18566 offsets);
18567
18568 for (unsigned i = 0; i < (kMaxLaneCount / 2); i++) {
18569    // Assign encodable offsets to the first part of the offset array so that
18570    // both encodable and unencodable offsets can be tested.
18571    // Note that the immediate offset field is only six bits wide.
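    // Offsets in this first half stay encodable in that immediate (which is
    // scaled by the memory element size); the untouched second half exercises
    // whatever fallback the MacroAssembler uses for unencodable offsets,
    // typically materialising the address in a scratch register.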
18572 offsets[i] = (offsets[i] % (UINT64_C(1) << 6)) * msize_in_bytes;
18573 }
18574
18575 ZRegister zn = z0.WithLaneSize(esize_in_bits);
18576 ZRegister zn_ref = z4.WithLaneSize(esize_in_bits);
18577
18578 PRegisterZ pg = p0.Zeroing();
18579 Initialise(&masm,
18580 pg,
18581 0x9abcdef012345678,
18582 0xabcdef0123456789,
18583 0xf4f3f1f0fefdfcfa,
18584 0xf9f8f6f5f3f2f0ff);
18585
18586 __ Mov(x2, data);
18587  uint64_t encodable_offset = offsets[0];
18588  // Simple check that the operation is correct for a single offset.
18589  (masm.*sve_ld1)(zn, pg, SVEMemOperand(x2, encodable_offset));
18590
18591  // Generate a reference result using scalar loads.
18592  uint64_t address = data + encodable_offset;
18593 uint64_t duplicated_addresses[kMaxLaneCount];
18594 for (unsigned i = 0; i < kMaxLaneCount; i++) {
18595 duplicated_addresses[i] = address;
18596 }
18597
18598 ScalarLoadHelper(&masm,
18599 vl,
18600 duplicated_addresses,
18601 zn_ref,
18602 pg,
18603 esize_in_bits,
18604 msize_in_bits,
18605 is_signed);
18606
18607 ZRegister zn_agg = z10.WithLaneSize(esize_in_bits);
18608 ZRegister zn_agg_ref = z11.WithLaneSize(esize_in_bits);
18609 ZRegister zn_temp = z12.WithLaneSize(esize_in_bits);
18610
18611 __ Dup(zn_agg, 0);
18612 __ Dup(zn_agg_ref, 0);
18613
18614  // Check that the operation is correct over a range of offsets.
18615 for (unsigned i = 0; i < (vl / esize_in_bytes); i++) {
18616 (masm.*sve_ld1)(zn_temp, pg, SVEMemOperand(x2, offsets[i]));
18617 __ Lastb(x1, pg, zn_temp);
18618 __ Insr(zn_agg, x1);
18619
18620 __ Mov(x3, data + offsets[i]);
18621 ScalarLoadHelper(&masm, x1, x3, msize_in_bits, is_signed);
18622 __ Insr(zn_agg_ref, x1);
18623 }
18624
18625 END();
18626
18627 if (CAN_RUN()) {
18628 RUN();
18629
18630 ASSERT_EQUAL_SVE(zn_ref, zn);
18631 ASSERT_EQUAL_SVE(zn_agg_ref, zn_agg);
18632 }
18633
18634 free(reinterpret_cast<void*>(data));
18635}
18636
18637TEST_SVE(sve_ld1rb) {
18638 LoadBcastHelper(config, kBRegSize, kBRegSize, &MacroAssembler::Ld1rb, false);
18639 LoadBcastHelper(config, kBRegSize, kHRegSize, &MacroAssembler::Ld1rb, false);
18640 LoadBcastHelper(config, kBRegSize, kSRegSize, &MacroAssembler::Ld1rb, false);
18641 LoadBcastHelper(config, kBRegSize, kDRegSize, &MacroAssembler::Ld1rb, false);
18642}
18643
18644TEST_SVE(sve_ld1rh) {
18645 LoadBcastHelper(config, kHRegSize, kHRegSize, &MacroAssembler::Ld1rh, false);
18646 LoadBcastHelper(config, kHRegSize, kSRegSize, &MacroAssembler::Ld1rh, false);
18647 LoadBcastHelper(config, kHRegSize, kDRegSize, &MacroAssembler::Ld1rh, false);
18648}
18649
18650TEST_SVE(sve_ld1rw) {
18651 LoadBcastHelper(config, kSRegSize, kSRegSize, &MacroAssembler::Ld1rw, false);
18652 LoadBcastHelper(config, kSRegSize, kDRegSize, &MacroAssembler::Ld1rw, false);
18653}
18654
18655TEST_SVE(sve_ld1rd) {
18656 LoadBcastHelper(config, kDRegSize, kDRegSize, &MacroAssembler::Ld1rd, false);
18657}
18658
18659TEST_SVE(sve_ld1rsb) {
18660 LoadBcastHelper(config, kBRegSize, kHRegSize, &MacroAssembler::Ld1rsb, true);
18661 LoadBcastHelper(config, kBRegSize, kSRegSize, &MacroAssembler::Ld1rsb, true);
18662 LoadBcastHelper(config, kBRegSize, kDRegSize, &MacroAssembler::Ld1rsb, true);
18663}
18664
18665TEST_SVE(sve_ld1rsh) {
18666 LoadBcastHelper(config, kHRegSize, kSRegSize, &MacroAssembler::Ld1rsh, true);
18667 LoadBcastHelper(config, kHRegSize, kDRegSize, &MacroAssembler::Ld1rsh, true);
18668}
18669
18670TEST_SVE(sve_ld1rsw) {
18671 LoadBcastHelper(config, kSRegSize, kDRegSize, &MacroAssembler::Ld1rsw, true);
18672}
18673
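// Prefetch instructions have no architecturally visible effect, so this test
// only checks that each addressing form assembles and executes without
// faulting.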
18674TEST_SVE(sve_prefetch_offset) {
18675 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
18676
18677 START();
18678
18679 __ Prfb(PLDL1KEEP, p5, SVEMemOperand(z30.VnS(), 0));
18680 __ Prfb(PLDL1STRM, p5, SVEMemOperand(x28, -11, SVE_MUL_VL));
18681  __ Prfb(PLDL2KEEP, p6, SVEMemOperand(x30, x29));
18682  __ Prfb(PLDL2STRM, p6, SVEMemOperand(x7, z12.VnS(), UXTW));
18683 __ Prfh(PSTL2KEEP, p6, SVEMemOperand(z0.VnS(), 28));
18684 __ Prfh(PSTL2STRM, p4, SVEMemOperand(x17, -3, SVE_MUL_VL));
18685 __ Prfh(PSTL3KEEP, p3, SVEMemOperand(x0, x0));
18686 __ Prfh(PSTL3STRM, p4, SVEMemOperand(x20, z0.VnD()));
18687 __ Prfw(PLDL1KEEP, p3, SVEMemOperand(z23.VnD(), 5));
18688 __ Prfw(PLDL1STRM, p1, SVEMemOperand(x4, 10, SVE_MUL_VL));
18689 __ Prfw(PLDL2KEEP, p2, SVEMemOperand(x22, x22));
18690 __ Prfw(PLDL2STRM, p1, SVEMemOperand(x2, z6.VnS(), SXTW));
18691 __ Prfd(PLDL3KEEP, p5, SVEMemOperand(z11.VnD(), 9));
18692 __ Prfd(PLDL3STRM, p3, SVEMemOperand(x0, -24, SVE_MUL_VL));
18693 __ Prfd(PSTL1KEEP, p7, SVEMemOperand(x5, x5));
18694 __ Prfd(PSTL1STRM, p1, SVEMemOperand(x19, z18.VnS(), SXTW));
18695
18696 END();
18697 if (CAN_RUN()) {
18698 RUN();
18699 }
18700}
18701
18702} // namespace aarch64
18703} // namespace vixl