blob: 03ff7b006664828fc8a03b12a2ce6f978c0062ad [file] [log] [blame]
Jacob Bramleyd77a8e42019-02-12 16:52:24 +00001// Copyright 2019, VIXL authors
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without
5// modification, are permitted provided that the following conditions are met:
6//
7// * Redistributions of source code must retain the above copyright notice,
8// this list of conditions and the following disclaimer.
9// * Redistributions in binary form must reproduce the above copyright notice,
10// this list of conditions and the following disclaimer in the documentation
11// and/or other materials provided with the distribution.
12// * Neither the name of ARM Limited nor the names of its contributors may be
13// used to endorse or promote products derived from this software without
14// specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND
17// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
27#include <sys/mman.h>
Jacob Bramley85a9c102019-12-09 17:48:29 +000028#include <unistd.h>
Jacob Bramleyd77a8e42019-02-12 16:52:24 +000029
30#include <cfloat>
31#include <cmath>
32#include <cstdio>
33#include <cstdlib>
34#include <cstring>
TatWai Chong1af34f12020-06-01 20:54:06 -070035#include <functional>
Jacob Bramleyd77a8e42019-02-12 16:52:24 +000036
37#include "test-runner.h"
38#include "test-utils.h"
39#include "aarch64/test-utils-aarch64.h"
40
41#include "aarch64/cpu-aarch64.h"
42#include "aarch64/disasm-aarch64.h"
43#include "aarch64/macro-assembler-aarch64.h"
44#include "aarch64/simulator-aarch64.h"
45#include "test-assembler-aarch64.h"
46
Martyn Capewelldba51cc2020-08-27 13:48:26 +010047#define TEST_SVE(name) TEST_SVE_INNER("ASM", name)
48
Jacob Bramleyd77a8e42019-02-12 16:52:24 +000049namespace vixl {
50namespace aarch64 {
51
Jacob Bramley03c0b512019-02-22 16:42:06 +000052// Call masm->Insr repeatedly to allow test inputs to be set up concisely. This
53// is optimised for call-site clarity, not generated code quality, so it doesn't
54// exist in the MacroAssembler itself.
55//
56// Usage:
57//
58// int values[] = { 42, 43, 44 };
59// InsrHelper(&masm, z0.VnS(), values); // Sets z0.S = { ..., 42, 43, 44 }
60//
61// The rightmost (highest-indexed) array element maps to the lowest-numbered
62// lane.
63template <typename T, size_t N>
64void InsrHelper(MacroAssembler* masm,
65 const ZRegister& zdn,
66 const T (&values)[N]) {
67 for (size_t i = 0; i < N; i++) {
68 masm->Insr(zdn, values[i]);
69 }
70}
71
Jacob Bramley0ce75842019-07-17 18:12:50 +010072// Conveniently initialise P registers with scalar bit patterns. The destination
73// lane size is ignored. This is optimised for call-site clarity, not generated
74// code quality.
Jacob Bramley2eaecf12019-05-01 15:46:34 +010075//
76// Usage:
77//
Jacob Bramley0ce75842019-07-17 18:12:50 +010078// Initialise(&masm, p0, 0x1234); // Sets p0 = 0b'0001'0010'0011'0100
Jacob Bramley2eaecf12019-05-01 15:46:34 +010079void Initialise(MacroAssembler* masm,
Jacob Bramley0ce75842019-07-17 18:12:50 +010080 const PRegister& pd,
81 uint64_t value3,
82 uint64_t value2,
83 uint64_t value1,
84 uint64_t value0) {
85 // Generate a literal pool, as in the array form.
Jacob Bramley2eaecf12019-05-01 15:46:34 +010086 UseScratchRegisterScope temps(masm);
87 Register temp = temps.AcquireX();
88 Label data;
89 Label done;
90
Jacob Bramley2eaecf12019-05-01 15:46:34 +010091 masm->Adr(temp, &data);
Jacob Bramley66e66712019-08-02 17:45:32 +010092 masm->Ldr(pd, SVEMemOperand(temp));
Jacob Bramley2eaecf12019-05-01 15:46:34 +010093 masm->B(&done);
94 {
95 ExactAssemblyScope total(masm, kPRegMaxSizeInBytes);
96 masm->bind(&data);
Jacob Bramley0ce75842019-07-17 18:12:50 +010097 masm->dc64(value0);
98 masm->dc64(value1);
99 masm->dc64(value2);
100 masm->dc64(value3);
Jacob Bramley2eaecf12019-05-01 15:46:34 +0100101 }
102 masm->Bind(&done);
103}
Jacob Bramley0ce75842019-07-17 18:12:50 +0100104void Initialise(MacroAssembler* masm,
105 const PRegister& pd,
106 uint64_t value2,
107 uint64_t value1,
108 uint64_t value0) {
109 Initialise(masm, pd, 0, value2, value1, value0);
110}
111void Initialise(MacroAssembler* masm,
112 const PRegister& pd,
113 uint64_t value1,
114 uint64_t value0) {
115 Initialise(masm, pd, 0, 0, value1, value0);
116}
117void Initialise(MacroAssembler* masm, const PRegister& pd, uint64_t value0) {
118 Initialise(masm, pd, 0, 0, 0, value0);
119}
120
121// Conveniently initialise P registers by lane. This is optimised for call-site
122// clarity, not generated code quality.
123//
124// Usage:
125//
126// int values[] = { 0x0, 0x1, 0x2 };
127// Initialise(&masm, p0.VnS(), values); // Sets p0 = 0b'0000'0001'0010
128//
129// The rightmost (highest-indexed) array element maps to the lowest-numbered
130// lane. Unspecified lanes are set to 0 (inactive).
131//
132// Each element of the `values` array is mapped onto a lane in `pd`. The
133// architecture only respects the lower bit, and writes zero the upper bits, but
134// other (encodable) values can be specified if required by the test.
135template <typename T, size_t N>
136void Initialise(MacroAssembler* masm,
137 const PRegisterWithLaneSize& pd,
138 const T (&values)[N]) {
139 // Turn the array into 64-bit chunks.
140 uint64_t chunks[4] = {0, 0, 0, 0};
141 VIXL_STATIC_ASSERT(sizeof(chunks) == kPRegMaxSizeInBytes);
142
143 int p_bits_per_lane = pd.GetLaneSizeInBits() / kZRegBitsPerPRegBit;
144 VIXL_ASSERT((64 % p_bits_per_lane) == 0);
145 VIXL_ASSERT((N * p_bits_per_lane) <= kPRegMaxSize);
146
147 uint64_t p_lane_mask = GetUintMask(p_bits_per_lane);
148
149 VIXL_STATIC_ASSERT(N <= kPRegMaxSize);
150 size_t bit = 0;
151 for (int n = static_cast<int>(N - 1); n >= 0; n--) {
152 VIXL_ASSERT(bit < (sizeof(chunks) * kBitsPerByte));
153 uint64_t value = values[n] & p_lane_mask;
154 chunks[bit / 64] |= value << (bit % 64);
155 bit += p_bits_per_lane;
156 }
157
158 Initialise(masm, pd, chunks[3], chunks[2], chunks[1], chunks[0]);
159}
Jacob Bramley2eaecf12019-05-01 15:46:34 +0100160
Jacob Bramleyd77a8e42019-02-12 16:52:24 +0000161// Ensure that basic test infrastructure works.
Jacob Bramleye8289202019-07-31 11:25:23 +0100162TEST_SVE(sve_test_infrastructure_z) {
163 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramleyd77a8e42019-02-12 16:52:24 +0000164 START();
165
Jacob Bramley03c0b512019-02-22 16:42:06 +0000166 __ Mov(x0, 0x0123456789abcdef);
167
168 // Test basic `Insr` behaviour.
169 __ Insr(z0.VnB(), 1);
170 __ Insr(z0.VnB(), 2);
171 __ Insr(z0.VnB(), x0);
172 __ Insr(z0.VnB(), -42);
173 __ Insr(z0.VnB(), 0);
174
175 // Test array inputs.
176 int z1_inputs[] = {3, 4, 5, -42, 0};
177 InsrHelper(&masm, z1.VnH(), z1_inputs);
178
179 // Test that sign-extension works as intended for various lane sizes.
180 __ Dup(z2.VnD(), 0); // Clear the register first.
181 __ Insr(z2.VnB(), -42); // 0xd6
182 __ Insr(z2.VnB(), 0xfe); // 0xfe
183 __ Insr(z2.VnH(), -42); // 0xffd6
184 __ Insr(z2.VnH(), 0xfedc); // 0xfedc
185 __ Insr(z2.VnS(), -42); // 0xffffffd6
186 __ Insr(z2.VnS(), 0xfedcba98); // 0xfedcba98
187 // Use another register for VnD(), so we can support 128-bit Z registers.
188 __ Insr(z3.VnD(), -42); // 0xffffffffffffffd6
189 __ Insr(z3.VnD(), 0xfedcba9876543210); // 0xfedcba9876543210
190
Jacob Bramleyd77a8e42019-02-12 16:52:24 +0000191 END();
Jacob Bramleyd77a8e42019-02-12 16:52:24 +0000192
Jacob Bramley119bd212019-04-16 10:13:09 +0100193 if (CAN_RUN()) {
Jacob Bramley9d06c4d2019-05-13 18:15:06 +0100194 RUN();
Jacob Bramley03c0b512019-02-22 16:42:06 +0000195
Jacob Bramley9d06c4d2019-05-13 18:15:06 +0100196 // Test that array checks work properly on a register initialised
197 // lane-by-lane.
198 int z0_inputs_b[] = {0x01, 0x02, 0xef, 0xd6, 0x00};
199 ASSERT_EQUAL_SVE(z0_inputs_b, z0.VnB());
Jacob Bramley03c0b512019-02-22 16:42:06 +0000200
Jacob Bramley9d06c4d2019-05-13 18:15:06 +0100201 // Test that lane-by-lane checks work properly on a register initialised
202 // by array.
203 for (size_t i = 0; i < ArrayLength(z1_inputs); i++) {
204 // The rightmost (highest-indexed) array element maps to the
205 // lowest-numbered lane.
206 int lane = static_cast<int>(ArrayLength(z1_inputs) - i - 1);
207 ASSERT_EQUAL_SVE_LANE(z1_inputs[i], z1.VnH(), lane);
Jacob Bramley03c0b512019-02-22 16:42:06 +0000208 }
Jacob Bramley9d06c4d2019-05-13 18:15:06 +0100209
210 uint64_t z2_inputs_d[] = {0x0000d6feffd6fedc, 0xffffffd6fedcba98};
211 ASSERT_EQUAL_SVE(z2_inputs_d, z2.VnD());
212 uint64_t z3_inputs_d[] = {0xffffffffffffffd6, 0xfedcba9876543210};
213 ASSERT_EQUAL_SVE(z3_inputs_d, z3.VnD());
Jacob Bramley119bd212019-04-16 10:13:09 +0100214 }
Jacob Bramleyd77a8e42019-02-12 16:52:24 +0000215}
216
Jacob Bramley2eaecf12019-05-01 15:46:34 +0100217// Ensure that basic test infrastructure works.
Jacob Bramleye8289202019-07-31 11:25:23 +0100218TEST_SVE(sve_test_infrastructure_p) {
219 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramley2eaecf12019-05-01 15:46:34 +0100220 START();
221
222 // Simple cases: move boolean (0 or 1) values.
223
Jacob Bramley9d06c4d2019-05-13 18:15:06 +0100224 int p0_inputs[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0};
Jacob Bramley2eaecf12019-05-01 15:46:34 +0100225 Initialise(&masm, p0.VnB(), p0_inputs);
226
227 int p1_inputs[] = {1, 0, 1, 1, 0, 1, 1, 1};
228 Initialise(&masm, p1.VnH(), p1_inputs);
229
Jacob Bramley9d06c4d2019-05-13 18:15:06 +0100230 int p2_inputs[] = {1, 1, 0, 1};
Jacob Bramley2eaecf12019-05-01 15:46:34 +0100231 Initialise(&masm, p2.VnS(), p2_inputs);
232
233 int p3_inputs[] = {0, 1};
234 Initialise(&masm, p3.VnD(), p3_inputs);
235
236 // Advanced cases: move numeric value into architecturally-ignored bits.
237
238 // B-sized lanes get one bit in a P register, so there are no ignored bits.
239
240 // H-sized lanes get two bits in a P register.
241 int p4_inputs[] = {0x3, 0x2, 0x1, 0x0, 0x1, 0x2, 0x3};
242 Initialise(&masm, p4.VnH(), p4_inputs);
243
244 // S-sized lanes get four bits in a P register.
245 int p5_inputs[] = {0xc, 0x7, 0x9, 0x6, 0xf};
246 Initialise(&masm, p5.VnS(), p5_inputs);
247
248 // D-sized lanes get eight bits in a P register.
249 int p6_inputs[] = {0x81, 0xcc, 0x55};
250 Initialise(&masm, p6.VnD(), p6_inputs);
251
252 // The largest possible P register has 32 bytes.
253 int p7_inputs[] = {0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
254 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
255 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
256 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f};
257 Initialise(&masm, p7.VnD(), p7_inputs);
258
259 END();
260
261 if (CAN_RUN()) {
Jacob Bramley9d06c4d2019-05-13 18:15:06 +0100262 RUN();
Jacob Bramley2eaecf12019-05-01 15:46:34 +0100263
Jacob Bramley9d06c4d2019-05-13 18:15:06 +0100264 // Test that lane-by-lane checks work properly. The rightmost
265 // (highest-indexed) array element maps to the lowest-numbered lane.
266 for (size_t i = 0; i < ArrayLength(p0_inputs); i++) {
267 int lane = static_cast<int>(ArrayLength(p0_inputs) - i - 1);
268 ASSERT_EQUAL_SVE_LANE(p0_inputs[i], p0.VnB(), lane);
Jacob Bramley2eaecf12019-05-01 15:46:34 +0100269 }
Jacob Bramley9d06c4d2019-05-13 18:15:06 +0100270 for (size_t i = 0; i < ArrayLength(p1_inputs); i++) {
271 int lane = static_cast<int>(ArrayLength(p1_inputs) - i - 1);
272 ASSERT_EQUAL_SVE_LANE(p1_inputs[i], p1.VnH(), lane);
273 }
274 for (size_t i = 0; i < ArrayLength(p2_inputs); i++) {
275 int lane = static_cast<int>(ArrayLength(p2_inputs) - i - 1);
276 ASSERT_EQUAL_SVE_LANE(p2_inputs[i], p2.VnS(), lane);
277 }
278 for (size_t i = 0; i < ArrayLength(p3_inputs); i++) {
279 int lane = static_cast<int>(ArrayLength(p3_inputs) - i - 1);
280 ASSERT_EQUAL_SVE_LANE(p3_inputs[i], p3.VnD(), lane);
281 }
282
283 // Test that array checks work properly on predicates initialised with a
284 // possibly-different lane size.
285 // 0b...11'10'01'00'01'10'11
286 int p4_expected[] = {0x39, 0x1b};
287 ASSERT_EQUAL_SVE(p4_expected, p4.VnD());
288
289 ASSERT_EQUAL_SVE(p5_inputs, p5.VnS());
290
291 // 0b...10000001'11001100'01010101
292 int p6_expected[] = {2, 0, 0, 1, 3, 0, 3, 0, 1, 1, 1, 1};
293 ASSERT_EQUAL_SVE(p6_expected, p6.VnH());
294
295 // 0b...10011100'10011101'10011110'10011111
296 int p7_expected[] = {1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1,
297 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1};
298 ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
Jacob Bramley2eaecf12019-05-01 15:46:34 +0100299 }
300}
301
Jacob Bramley935b15b2019-07-04 14:09:22 +0100302// Test that writes to V registers clear the high bits of the corresponding Z
303// register.
Jacob Bramleye8289202019-07-31 11:25:23 +0100304TEST_SVE(sve_v_write_clear) {
305 SVE_SETUP_WITH_FEATURES(CPUFeatures::kNEON,
306 CPUFeatures::kFP,
307 CPUFeatures::kSVE);
Jacob Bramley935b15b2019-07-04 14:09:22 +0100308 START();
309
310 // The Simulator has two mechansisms for writing V registers:
311 // - Write*Register, calling through to SimRegisterBase::Write.
312 // - LogicVRegister::ClearForWrite followed by one or more lane updates.
313 // Try to cover both variants.
314
315 // Prepare some known inputs.
316 uint8_t data[kQRegSizeInBytes];
317 for (size_t i = 0; i < kQRegSizeInBytes; i++) {
318 data[i] = 42 + i;
319 }
320 __ Mov(x10, reinterpret_cast<uintptr_t>(data));
321 __ Fmov(d30, 42.0);
322
Jacob Bramley199339d2019-08-05 18:49:13 +0100323 // Use Index to label the lane indices, so failures are easy to detect and
Jacob Bramley935b15b2019-07-04 14:09:22 +0100324 // diagnose.
325 __ Index(z0.VnB(), 0, 1);
326 __ Index(z1.VnB(), 0, 1);
327 __ Index(z2.VnB(), 0, 1);
328 __ Index(z3.VnB(), 0, 1);
329 __ Index(z4.VnB(), 0, 1);
330
331 __ Index(z10.VnB(), 0, -1);
332 __ Index(z11.VnB(), 0, -1);
333 __ Index(z12.VnB(), 0, -1);
334 __ Index(z13.VnB(), 0, -1);
335 __ Index(z14.VnB(), 0, -1);
336
337 // Instructions using Write*Register (and SimRegisterBase::Write).
338 __ Ldr(b0, MemOperand(x10));
339 __ Fcvt(h1, d30);
340 __ Fmov(s2, 1.5f);
341 __ Fmov(d3, d30);
342 __ Ldr(q4, MemOperand(x10));
343
344 // Instructions using LogicVRegister::ClearForWrite.
345 // These also (incidentally) test that across-lane instructions correctly
346 // ignore the high-order Z register lanes.
347 __ Sminv(b10, v10.V16B());
348 __ Addv(h11, v11.V4H());
349 __ Saddlv(s12, v12.V8H());
350 __ Dup(v13.V8B(), b13, kDRegSizeInBytes);
351 __ Uaddl(v14.V8H(), v14.V8B(), v14.V8B());
352
353 END();
354
355 if (CAN_RUN()) {
356 RUN();
357
358 // Check the Q part first.
359 ASSERT_EQUAL_128(0x0000000000000000, 0x000000000000002a, v0);
360 ASSERT_EQUAL_128(0x0000000000000000, 0x0000000000005140, v1); // 42.0 (f16)
361 ASSERT_EQUAL_128(0x0000000000000000, 0x000000003fc00000, v2); // 1.5 (f32)
362 ASSERT_EQUAL_128(0x0000000000000000, 0x4045000000000000, v3); // 42.0 (f64)
363 ASSERT_EQUAL_128(0x3938373635343332, 0x31302f2e2d2c2b2a, v4);
364 ASSERT_EQUAL_128(0x0000000000000000, 0x00000000000000f1, v10); // -15
365 // 0xf9fa + 0xfbfc + 0xfdfe + 0xff00 -> 0xf2f4
366 ASSERT_EQUAL_128(0x0000000000000000, 0x000000000000f2f4, v11);
367 // 0xfffff1f2 + 0xfffff3f4 + ... + 0xfffffdfe + 0xffffff00 -> 0xffffc6c8
368 ASSERT_EQUAL_128(0x0000000000000000, 0x00000000ffffc6c8, v12);
369 ASSERT_EQUAL_128(0x0000000000000000, 0xf8f8f8f8f8f8f8f8, v13); // [-8] x 8
370 // [0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, 0x0000]
371 // + [0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, 0x0000]
372 // -> [0x01f2, 0x01f4, 0x01f6, 0x01f8, 0x01fa, 0x01fc, 0x01fe, 0x0000]
373 ASSERT_EQUAL_128(0x01f201f401f601f8, 0x01fa01fc01fe0000, v14);
374
375 // Check that the upper lanes are all clear.
376 for (int i = kQRegSizeInBytes; i < core.GetSVELaneCount(kBRegSize); i++) {
377 ASSERT_EQUAL_SVE_LANE(0x00, z0.VnB(), i);
378 ASSERT_EQUAL_SVE_LANE(0x00, z1.VnB(), i);
379 ASSERT_EQUAL_SVE_LANE(0x00, z2.VnB(), i);
380 ASSERT_EQUAL_SVE_LANE(0x00, z3.VnB(), i);
381 ASSERT_EQUAL_SVE_LANE(0x00, z4.VnB(), i);
382 ASSERT_EQUAL_SVE_LANE(0x00, z10.VnB(), i);
383 ASSERT_EQUAL_SVE_LANE(0x00, z11.VnB(), i);
384 ASSERT_EQUAL_SVE_LANE(0x00, z12.VnB(), i);
385 ASSERT_EQUAL_SVE_LANE(0x00, z13.VnB(), i);
386 ASSERT_EQUAL_SVE_LANE(0x00, z14.VnB(), i);
387 }
388 }
389}
390
Jacob Bramleye8289202019-07-31 11:25:23 +0100391static void MlaMlsHelper(Test* config, unsigned lane_size_in_bits) {
392 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramley22023df2019-05-14 17:55:43 +0100393 START();
394
Jacob Bramleyae2fc3b2019-05-21 19:24:36 +0100395 int zd_inputs[] = {0xbb, 0xcc, 0xdd, 0xee};
Jacob Bramley22023df2019-05-14 17:55:43 +0100396 int za_inputs[] = {-39, 1, -3, 2};
397 int zn_inputs[] = {-5, -20, 9, 8};
398 int zm_inputs[] = {9, -5, 4, 5};
399
400 ZRegister zd = z0.WithLaneSize(lane_size_in_bits);
401 ZRegister za = z1.WithLaneSize(lane_size_in_bits);
402 ZRegister zn = z2.WithLaneSize(lane_size_in_bits);
403 ZRegister zm = z3.WithLaneSize(lane_size_in_bits);
404
405 // TODO: Use a simple `Dup` once it accepts arbitrary immediates.
Jacob Bramleyae2fc3b2019-05-21 19:24:36 +0100406 InsrHelper(&masm, zd, zd_inputs);
Jacob Bramley22023df2019-05-14 17:55:43 +0100407 InsrHelper(&masm, za, za_inputs);
408 InsrHelper(&masm, zn, zn_inputs);
409 InsrHelper(&masm, zm, zm_inputs);
410
411 int p0_inputs[] = {1, 1, 0, 1};
412 int p1_inputs[] = {1, 0, 1, 1};
413 int p2_inputs[] = {0, 1, 1, 1};
Jacob Bramleyae2fc3b2019-05-21 19:24:36 +0100414 int p3_inputs[] = {1, 1, 1, 0};
Jacob Bramley22023df2019-05-14 17:55:43 +0100415
416 Initialise(&masm, p0.WithLaneSize(lane_size_in_bits), p0_inputs);
417 Initialise(&masm, p1.WithLaneSize(lane_size_in_bits), p1_inputs);
418 Initialise(&masm, p2.WithLaneSize(lane_size_in_bits), p2_inputs);
419 Initialise(&masm, p3.WithLaneSize(lane_size_in_bits), p3_inputs);
420
421 // The Mla macro automatically selects between mla, mad and movprfx + mla
422 // based on what registers are aliased.
423 ZRegister mla_da_result = z10.WithLaneSize(lane_size_in_bits);
424 ZRegister mla_dn_result = z11.WithLaneSize(lane_size_in_bits);
425 ZRegister mla_dm_result = z12.WithLaneSize(lane_size_in_bits);
Jacob Bramleyae2fc3b2019-05-21 19:24:36 +0100426 ZRegister mla_d_result = z13.WithLaneSize(lane_size_in_bits);
Jacob Bramley22023df2019-05-14 17:55:43 +0100427
428 __ Mov(mla_da_result, za);
429 __ Mla(mla_da_result, p0.Merging(), mla_da_result, zn, zm);
430
431 __ Mov(mla_dn_result, zn);
432 __ Mla(mla_dn_result, p1.Merging(), za, mla_dn_result, zm);
433
434 __ Mov(mla_dm_result, zm);
435 __ Mla(mla_dm_result, p2.Merging(), za, zn, mla_dm_result);
436
Jacob Bramleyae2fc3b2019-05-21 19:24:36 +0100437 __ Mov(mla_d_result, zd);
438 __ Mla(mla_d_result, p3.Merging(), za, zn, zm);
Jacob Bramley22023df2019-05-14 17:55:43 +0100439
440 // The Mls macro automatically selects between mls, msb and movprfx + mls
441 // based on what registers are aliased.
442 ZRegister mls_da_result = z20.WithLaneSize(lane_size_in_bits);
443 ZRegister mls_dn_result = z21.WithLaneSize(lane_size_in_bits);
444 ZRegister mls_dm_result = z22.WithLaneSize(lane_size_in_bits);
Jacob Bramleyae2fc3b2019-05-21 19:24:36 +0100445 ZRegister mls_d_result = z23.WithLaneSize(lane_size_in_bits);
Jacob Bramley22023df2019-05-14 17:55:43 +0100446
447 __ Mov(mls_da_result, za);
448 __ Mls(mls_da_result, p0.Merging(), mls_da_result, zn, zm);
449
450 __ Mov(mls_dn_result, zn);
451 __ Mls(mls_dn_result, p1.Merging(), za, mls_dn_result, zm);
452
453 __ Mov(mls_dm_result, zm);
454 __ Mls(mls_dm_result, p2.Merging(), za, zn, mls_dm_result);
455
Jacob Bramleyae2fc3b2019-05-21 19:24:36 +0100456 __ Mov(mls_d_result, zd);
457 __ Mls(mls_d_result, p3.Merging(), za, zn, zm);
Jacob Bramley22023df2019-05-14 17:55:43 +0100458
459 END();
460
461 if (CAN_RUN()) {
462 RUN();
463
464 ASSERT_EQUAL_SVE(za_inputs, z1.WithLaneSize(lane_size_in_bits));
465 ASSERT_EQUAL_SVE(zn_inputs, z2.WithLaneSize(lane_size_in_bits));
466 ASSERT_EQUAL_SVE(zm_inputs, z3.WithLaneSize(lane_size_in_bits));
467
468 int mla[] = {-84, 101, 33, 42};
469 int mls[] = {6, -99, -39, -38};
470
471 int mla_da_expected[] = {mla[0], mla[1], za_inputs[2], mla[3]};
472 ASSERT_EQUAL_SVE(mla_da_expected, mla_da_result);
473
474 int mla_dn_expected[] = {mla[0], zn_inputs[1], mla[2], mla[3]};
475 ASSERT_EQUAL_SVE(mla_dn_expected, mla_dn_result);
476
477 int mla_dm_expected[] = {zm_inputs[0], mla[1], mla[2], mla[3]};
478 ASSERT_EQUAL_SVE(mla_dm_expected, mla_dm_result);
479
Jacob Bramleyae2fc3b2019-05-21 19:24:36 +0100480 int mla_d_expected[] = {mla[0], mla[1], mla[2], zd_inputs[3]};
481 ASSERT_EQUAL_SVE(mla_d_expected, mla_d_result);
Jacob Bramley22023df2019-05-14 17:55:43 +0100482
483 int mls_da_expected[] = {mls[0], mls[1], za_inputs[2], mls[3]};
484 ASSERT_EQUAL_SVE(mls_da_expected, mls_da_result);
485
486 int mls_dn_expected[] = {mls[0], zn_inputs[1], mls[2], mls[3]};
487 ASSERT_EQUAL_SVE(mls_dn_expected, mls_dn_result);
488
489 int mls_dm_expected[] = {zm_inputs[0], mls[1], mls[2], mls[3]};
490 ASSERT_EQUAL_SVE(mls_dm_expected, mls_dm_result);
491
Jacob Bramleyae2fc3b2019-05-21 19:24:36 +0100492 int mls_d_expected[] = {mls[0], mls[1], mls[2], zd_inputs[3]};
493 ASSERT_EQUAL_SVE(mls_d_expected, mls_d_result);
Jacob Bramley22023df2019-05-14 17:55:43 +0100494 }
495}
496
Jacob Bramleye8289202019-07-31 11:25:23 +0100497TEST_SVE(sve_mla_mls_b) { MlaMlsHelper(config, kBRegSize); }
498TEST_SVE(sve_mla_mls_h) { MlaMlsHelper(config, kHRegSize); }
499TEST_SVE(sve_mla_mls_s) { MlaMlsHelper(config, kSRegSize); }
500TEST_SVE(sve_mla_mls_d) { MlaMlsHelper(config, kDRegSize); }
Jacob Bramley22023df2019-05-14 17:55:43 +0100501
Jacob Bramleye8289202019-07-31 11:25:23 +0100502TEST_SVE(sve_bitwise_unpredicate_logical) {
503 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
TatWai Chongcfb94212019-05-16 13:30:09 -0700504 START();
505
506 uint64_t z8_inputs[] = {0xfedcba9876543210, 0x0123456789abcdef};
507 InsrHelper(&masm, z8.VnD(), z8_inputs);
508 uint64_t z15_inputs[] = {0xffffeeeeddddcccc, 0xccccddddeeeeffff};
509 InsrHelper(&masm, z15.VnD(), z15_inputs);
510
511 __ And(z1.VnD(), z8.VnD(), z15.VnD());
512 __ Bic(z2.VnD(), z8.VnD(), z15.VnD());
513 __ Eor(z3.VnD(), z8.VnD(), z15.VnD());
514 __ Orr(z4.VnD(), z8.VnD(), z15.VnD());
515
516 END();
517
518 if (CAN_RUN()) {
519 RUN();
520 uint64_t z1_expected[] = {0xfedcaa8854540000, 0x0000454588aacdef};
521 uint64_t z2_expected[] = {0x0000101022003210, 0x0123002201010000};
522 uint64_t z3_expected[] = {0x01235476ab89fedc, 0xcdef98ba67453210};
523 uint64_t z4_expected[] = {0xfffffefeffddfedc, 0xcdefddffefefffff};
524
525 ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
526 ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
527 ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
528 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
529 }
TatWai Chongcfb94212019-05-16 13:30:09 -0700530}
531
Martyn Capewellf804b602020-02-24 18:57:18 +0000532TEST_SVE(sve_last_r) {
533 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
534 START();
535
536 __ Pfalse(p1.VnB());
537 int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
538 int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
539 Initialise(&masm, p2.VnB(), p2_inputs);
540 Initialise(&masm, p3.VnB(), p3_inputs);
541 __ Ptrue(p4.VnB());
542
543 __ Index(z0.VnB(), 0x10, 1);
544 __ Lasta(x1, p1, z0.VnB());
545 __ Lastb(x2, p1, z0.VnB());
546 __ Lasta(x3, p2, z0.VnB());
547 __ Lastb(x4, p2, z0.VnB());
548 __ Lasta(x5, p3, z0.VnB());
549 __ Lastb(x6, p3, z0.VnB());
550 __ Lasta(x7, p4, z0.VnB());
551
552 __ Punpklo(p3.VnH(), p3.VnB());
553 __ Index(z0.VnH(), 0x1110, 1);
554 __ Lasta(x9, p1, z0.VnH());
555 __ Lastb(x10, p3, z0.VnH());
556 __ Lasta(x12, p4, z0.VnH());
557
558 __ Index(z0.VnS(), 0x11111110, 1);
559 __ Lastb(x13, p1, z0.VnS());
560 __ Lasta(x14, p2, z0.VnS());
561 __ Lastb(x18, p4, z0.VnS());
562
563 __ Index(z0.VnD(), 0x1111111111111110, 1);
564 __ Lasta(x19, p1, z0.VnD());
565 __ Lastb(x20, p3, z0.VnD());
566 __ Lasta(x21, p3, z0.VnD());
567 END();
568
569 if (CAN_RUN()) {
570 RUN();
571
572 ASSERT_EQUAL_64(0x0000000000000010, x1);
573 ASSERT_EQUAL_64(0x0000000000000011, x3);
574 ASSERT_EQUAL_64(0x0000000000000010, x4);
575 ASSERT_EQUAL_64(0x0000000000000019, x5);
576 ASSERT_EQUAL_64(0x0000000000000018, x6);
577 ASSERT_EQUAL_64(0x0000000000000010, x7);
578 ASSERT_EQUAL_64(0x0000000000001110, x9);
579 ASSERT_EQUAL_64(0x0000000000001110, x12);
580 ASSERT_EQUAL_64(0x0000000011111111, x14);
581 ASSERT_EQUAL_64(0x1111111111111110, x19);
582
583 int vl = core.GetSVELaneCount(kBRegSize) * 8;
584 switch (vl) {
585 case 128:
586 ASSERT_EQUAL_64(0x000000000000001f, x2);
587 ASSERT_EQUAL_64(0x0000000000001116, x10);
588 ASSERT_EQUAL_64(0x0000000011111113, x13);
589 ASSERT_EQUAL_64(0x0000000011111113, x18);
590 ASSERT_EQUAL_64(0x1111111111111111, x20);
591 ASSERT_EQUAL_64(0x1111111111111110, x21);
592 break;
593 case 384:
594 ASSERT_EQUAL_64(0x000000000000003f, x2);
595 ASSERT_EQUAL_64(0x0000000000001118, x10);
596 ASSERT_EQUAL_64(0x000000001111111b, x13);
597 ASSERT_EQUAL_64(0x000000001111111b, x18);
598 ASSERT_EQUAL_64(0x1111111111111112, x20);
599 ASSERT_EQUAL_64(0x1111111111111113, x21);
600 break;
601 case 2048:
602 ASSERT_EQUAL_64(0x000000000000000f, x2);
603 ASSERT_EQUAL_64(0x0000000000001118, x10);
604 ASSERT_EQUAL_64(0x000000001111114f, x13);
605 ASSERT_EQUAL_64(0x000000001111114f, x18);
606 ASSERT_EQUAL_64(0x1111111111111112, x20);
607 ASSERT_EQUAL_64(0x1111111111111113, x21);
608 break;
609 default:
610 printf("WARNING: Some tests skipped due to unexpected VL.\n");
611 break;
612 }
613 }
614}
615
616TEST_SVE(sve_last_v) {
617 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
618 START();
619
620 __ Pfalse(p1.VnB());
621 int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
622 int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
623 Initialise(&masm, p2.VnB(), p2_inputs);
624 Initialise(&masm, p3.VnB(), p3_inputs);
625 __ Ptrue(p4.VnB());
626
627 __ Index(z0.VnB(), 0x10, 1);
628 __ Lasta(b1, p1, z0.VnB());
629 __ Lastb(b2, p1, z0.VnB());
630 __ Lasta(b3, p2, z0.VnB());
631 __ Lastb(b4, p2, z0.VnB());
632 __ Lasta(b5, p3, z0.VnB());
633 __ Lastb(b6, p3, z0.VnB());
634 __ Lasta(b7, p4, z0.VnB());
635
636 __ Punpklo(p3.VnH(), p3.VnB());
637 __ Index(z0.VnH(), 0x1110, 1);
638 __ Lasta(h9, p1, z0.VnH());
639 __ Lastb(h10, p3, z0.VnH());
640 __ Lasta(h12, p4, z0.VnH());
641
642 __ Index(z0.VnS(), 0x11111110, 1);
643 __ Lastb(s13, p1, z0.VnS());
644 __ Lasta(s14, p2, z0.VnS());
645 __ Lastb(s18, p4, z0.VnS());
646
647 __ Index(z0.VnD(), 0x1111111111111110, 1);
648 __ Lasta(d19, p1, z0.VnD());
649 __ Lastb(d20, p3, z0.VnD());
650 __ Lasta(d21, p3, z0.VnD());
651 END();
652
653 if (CAN_RUN()) {
654 RUN();
655
656 ASSERT_EQUAL_128(0, 0x0000000000000010, q1);
657 ASSERT_EQUAL_128(0, 0x0000000000000011, q3);
658 ASSERT_EQUAL_128(0, 0x0000000000000010, q4);
659 ASSERT_EQUAL_128(0, 0x0000000000000019, q5);
660 ASSERT_EQUAL_128(0, 0x0000000000000018, q6);
661 ASSERT_EQUAL_128(0, 0x0000000000000010, q7);
662 ASSERT_EQUAL_128(0, 0x0000000000001110, q9);
663 ASSERT_EQUAL_128(0, 0x0000000000001110, q12);
664 ASSERT_EQUAL_128(0, 0x0000000011111111, q14);
665 ASSERT_EQUAL_128(0, 0x1111111111111110, q19);
666
667 int vl = core.GetSVELaneCount(kBRegSize) * 8;
668 switch (vl) {
669 case 128:
670 ASSERT_EQUAL_128(0, 0x000000000000001f, q2);
671 ASSERT_EQUAL_128(0, 0x0000000000001116, q10);
672 ASSERT_EQUAL_128(0, 0x0000000011111113, q13);
673 ASSERT_EQUAL_128(0, 0x0000000011111113, q18);
674 ASSERT_EQUAL_128(0, 0x1111111111111111, q20);
675 ASSERT_EQUAL_128(0, 0x1111111111111110, q21);
676 break;
677 case 384:
678 ASSERT_EQUAL_128(0, 0x000000000000003f, q2);
679 ASSERT_EQUAL_128(0, 0x0000000000001118, q10);
680 ASSERT_EQUAL_128(0, 0x000000001111111b, q13);
681 ASSERT_EQUAL_128(0, 0x000000001111111b, q18);
682 ASSERT_EQUAL_128(0, 0x1111111111111112, q20);
683 ASSERT_EQUAL_128(0, 0x1111111111111113, q21);
684 break;
685 case 2048:
686 ASSERT_EQUAL_128(0, 0x000000000000000f, q2);
687 ASSERT_EQUAL_128(0, 0x0000000000001118, q10);
688 ASSERT_EQUAL_128(0, 0x000000001111114f, q13);
689 ASSERT_EQUAL_128(0, 0x000000001111114f, q18);
690 ASSERT_EQUAL_128(0, 0x1111111111111112, q20);
691 ASSERT_EQUAL_128(0, 0x1111111111111113, q21);
692 break;
693 default:
694 printf("WARNING: Some tests skipped due to unexpected VL.\n");
695 break;
696 }
697 }
698}
699
700TEST_SVE(sve_clast_r) {
701 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
702 START();
703
704 __ Pfalse(p1.VnB());
705 int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
706 int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
707 Initialise(&masm, p2.VnB(), p2_inputs);
708 Initialise(&masm, p3.VnB(), p3_inputs);
709 __ Ptrue(p4.VnB());
710
711 __ Index(z0.VnB(), 0x10, 1);
712 __ Mov(x1, -1);
713 __ Mov(x2, -1);
714 __ Clasta(x1, p1, x1, z0.VnB());
715 __ Clastb(x2, p1, x2, z0.VnB());
716 __ Clasta(x3, p2, x3, z0.VnB());
717 __ Clastb(x4, p2, x4, z0.VnB());
718 __ Clasta(x5, p3, x5, z0.VnB());
719 __ Clastb(x6, p3, x6, z0.VnB());
720 __ Clasta(x7, p4, x7, z0.VnB());
721
722 __ Punpklo(p3.VnH(), p3.VnB());
723 __ Index(z0.VnH(), 0x1110, 1);
724 __ Mov(x9, -1);
725 __ Clasta(x9, p1, x9, z0.VnH());
726 __ Clastb(x10, p3, x10, z0.VnH());
727 __ Clasta(x12, p4, x12, z0.VnH());
728
729 __ Index(z0.VnS(), 0x11111110, 1);
730 __ Mov(x13, -1);
731 __ Clasta(x13, p1, x13, z0.VnS());
732 __ Clastb(x14, p2, x14, z0.VnS());
733 __ Clasta(x18, p4, x18, z0.VnS());
734
735 __ Index(z0.VnD(), 0x1111111111111110, 1);
736 __ Mov(x19, -1);
737 __ Clasta(x19, p1, x19, z0.VnD());
738 __ Clastb(x20, p2, x20, z0.VnD());
739 __ Clasta(x21, p4, x21, z0.VnD());
740 END();
741
742 if (CAN_RUN()) {
743 RUN();
744 ASSERT_EQUAL_64(0x00000000000000ff, x1);
745 ASSERT_EQUAL_64(0x00000000000000ff, x2);
746 ASSERT_EQUAL_64(0x0000000000000011, x3);
747 ASSERT_EQUAL_64(0x0000000000000010, x4);
748 ASSERT_EQUAL_64(0x0000000000000019, x5);
749 ASSERT_EQUAL_64(0x0000000000000018, x6);
750 ASSERT_EQUAL_64(0x0000000000000010, x7);
751 ASSERT_EQUAL_64(0x000000000000ffff, x9);
752 ASSERT_EQUAL_64(0x0000000000001110, x12);
753 ASSERT_EQUAL_64(0x00000000ffffffff, x13);
754 ASSERT_EQUAL_64(0x0000000011111110, x14);
755 ASSERT_EQUAL_64(0x0000000011111110, x18);
756 ASSERT_EQUAL_64(0xffffffffffffffff, x19);
757 ASSERT_EQUAL_64(0x1111111111111110, x20);
758 ASSERT_EQUAL_64(0x1111111111111110, x21);
759
760 int vl = core.GetSVELaneCount(kBRegSize) * 8;
761 switch (vl) {
762 case 128:
763 ASSERT_EQUAL_64(0x0000000000001116, x10);
764 break;
765 case 384:
766 ASSERT_EQUAL_64(0x0000000000001118, x10);
767 break;
768 case 2048:
769 ASSERT_EQUAL_64(0x0000000000001118, x10);
770 break;
771 default:
772 printf("WARNING: Some tests skipped due to unexpected VL.\n");
773 break;
774 }
775 }
776}
777
778TEST_SVE(sve_clast_v) {
779 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
780 START();
781
782 __ Pfalse(p1.VnB());
783 int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
784 int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
785 Initialise(&masm, p2.VnB(), p2_inputs);
786 Initialise(&masm, p3.VnB(), p3_inputs);
787 __ Ptrue(p4.VnB());
788
789 __ Index(z0.VnB(), 0x10, 1);
790 __ Dup(z1.VnB(), -1);
791 __ Dup(z2.VnB(), -1);
792 __ Clasta(b1, p1, b1, z0.VnB());
793 __ Clastb(b2, p1, b2, z0.VnB());
794 __ Clasta(b3, p2, b3, z0.VnB());
795 __ Clastb(b4, p2, b4, z0.VnB());
796 __ Clasta(b5, p3, b5, z0.VnB());
797 __ Clastb(b6, p3, b6, z0.VnB());
798 __ Clasta(b7, p4, b7, z0.VnB());
799
800 __ Punpklo(p3.VnH(), p3.VnB());
801 __ Index(z0.VnH(), 0x1110, 1);
802 __ Dup(z9.VnB(), -1);
803 __ Clasta(h9, p1, h9, z0.VnH());
804 __ Clastb(h10, p3, h10, z0.VnH());
805 __ Clasta(h12, p4, h12, z0.VnH());
806
807 __ Index(z0.VnS(), 0x11111110, 1);
808 __ Dup(z13.VnB(), -1);
809 __ Clasta(s13, p1, s13, z0.VnS());
810 __ Clastb(s14, p2, s14, z0.VnS());
811 __ Clasta(s18, p4, s18, z0.VnS());
812
813 __ Index(z0.VnD(), 0x1111111111111110, 1);
814 __ Dup(z19.VnB(), -1);
815 __ Clasta(d19, p1, d19, z0.VnD());
816 __ Clastb(d20, p2, d20, z0.VnD());
817 __ Clasta(d21, p4, d21, z0.VnD());
818 END();
819
820 if (CAN_RUN()) {
821 RUN();
822 ASSERT_EQUAL_128(0, 0x00000000000000ff, q1);
823 ASSERT_EQUAL_128(0, 0x00000000000000ff, q2);
824 ASSERT_EQUAL_128(0, 0x0000000000000011, q3);
825 ASSERT_EQUAL_128(0, 0x0000000000000010, q4);
826 ASSERT_EQUAL_128(0, 0x0000000000000019, q5);
827 ASSERT_EQUAL_128(0, 0x0000000000000018, q6);
828 ASSERT_EQUAL_128(0, 0x0000000000000010, q7);
829 ASSERT_EQUAL_128(0, 0x000000000000ffff, q9);
830 ASSERT_EQUAL_128(0, 0x0000000000001110, q12);
831 ASSERT_EQUAL_128(0, 0x00000000ffffffff, q13);
832 ASSERT_EQUAL_128(0, 0x0000000011111110, q14);
833 ASSERT_EQUAL_128(0, 0x0000000011111110, q18);
834 ASSERT_EQUAL_128(0, 0xffffffffffffffff, q19);
835 ASSERT_EQUAL_128(0, 0x1111111111111110, q20);
836 ASSERT_EQUAL_128(0, 0x1111111111111110, q21);
837
838 int vl = core.GetSVELaneCount(kBRegSize) * 8;
839 switch (vl) {
840 case 128:
841 ASSERT_EQUAL_128(0, 0x0000000000001116, q10);
842 break;
843 case 384:
844 ASSERT_EQUAL_128(0, 0x0000000000001118, q10);
845 break;
846 case 2048:
847 ASSERT_EQUAL_128(0, 0x0000000000001118, q10);
848 break;
849 default:
850 printf("WARNING: Some tests skipped due to unexpected VL.\n");
851 break;
852 }
853 }
854}
855
856TEST_SVE(sve_clast_z) {
857 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
858 START();
859
860 __ Pfalse(p1.VnB());
861 int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
862 int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
863 Initialise(&masm, p2.VnB(), p2_inputs);
864 Initialise(&masm, p3.VnB(), p3_inputs);
865 __ Ptrue(p4.VnB());
866
867 __ Index(z0.VnB(), 0x10, 1);
868 __ Dup(z1.VnB(), 0xff);
869 __ Dup(z2.VnB(), 0xff);
870 __ Clasta(z1.VnB(), p1, z1.VnB(), z0.VnB());
871 __ Clastb(z2.VnB(), p1, z2.VnB(), z0.VnB());
872 __ Clasta(z3.VnB(), p2, z3.VnB(), z0.VnB());
873 __ Clastb(z4.VnB(), p2, z4.VnB(), z0.VnB());
874 __ Clasta(z5.VnB(), p3, z5.VnB(), z0.VnB());
875 __ Clastb(z6.VnB(), p3, z6.VnB(), z0.VnB());
876 __ Clasta(z7.VnB(), p4, z7.VnB(), z0.VnB());
877
878 __ Punpklo(p3.VnH(), p3.VnB());
879 __ Index(z0.VnH(), 0x1110, 1);
880 __ Dup(z9.VnB(), 0xff);
881 __ Clasta(z9.VnH(), p1, z9.VnH(), z0.VnH());
882 __ Clastb(z10.VnH(), p3, z10.VnH(), z0.VnH());
883 __ Clasta(z12.VnH(), p4, z12.VnH(), z0.VnH());
884
885 __ Index(z0.VnS(), 0x11111110, 1);
886 __ Dup(z13.VnB(), 0xff);
887 __ Clasta(z13.VnS(), p1, z13.VnS(), z0.VnS());
888 __ Clastb(z14.VnS(), p2, z14.VnS(), z0.VnS());
889 __ Clasta(z16.VnS(), p4, z16.VnS(), z0.VnS());
890
891 __ Index(z0.VnD(), 0x1111111111111110, 1);
892 __ Dup(z17.VnB(), 0xff);
893 __ Clasta(z17.VnD(), p1, z17.VnD(), z0.VnD());
894 __ Clastb(z18.VnD(), p2, z18.VnD(), z0.VnD());
895 __ Clasta(z20.VnD(), p4, z20.VnD(), z0.VnD());
896 END();
897
898 if (CAN_RUN()) {
899 RUN();
900 uint64_t z1_expected[] = {0xffffffffffffffff, 0xffffffffffffffff};
901 uint64_t z2_expected[] = {0xffffffffffffffff, 0xffffffffffffffff};
902 uint64_t z3_expected[] = {0x1111111111111111, 0x1111111111111111};
903 uint64_t z4_expected[] = {0x1010101010101010, 0x1010101010101010};
904 uint64_t z5_expected[] = {0x1919191919191919, 0x1919191919191919};
905 uint64_t z6_expected[] = {0x1818181818181818, 0x1818181818181818};
906 uint64_t z7_expected[] = {0x1010101010101010, 0x1010101010101010};
907 uint64_t z9_expected[] = {0xffffffffffffffff, 0xffffffffffffffff};
908 uint64_t z12_expected[] = {0x1110111011101110, 0x1110111011101110};
909 uint64_t z13_expected[] = {0xffffffffffffffff, 0xffffffffffffffff};
910 uint64_t z14_expected[] = {0x1111111011111110, 0x1111111011111110};
911 uint64_t z16_expected[] = {0x1111111011111110, 0x1111111011111110};
912 uint64_t z17_expected[] = {0xffffffffffffffff, 0xffffffffffffffff};
913 uint64_t z18_expected[] = {0x1111111111111110, 0x1111111111111110};
914 uint64_t z20_expected[] = {0x1111111111111110, 0x1111111111111110};
915
916 uint64_t z10_expected_vl128[] = {0x1116111611161116, 0x1116111611161116};
917 uint64_t z10_expected_vl_long[] = {0x1118111811181118, 0x1118111811181118};
918
919 ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
920 ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
921 ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
922 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
923 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
924 ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
925 ASSERT_EQUAL_SVE(z7_expected, z7.VnD());
926 ASSERT_EQUAL_SVE(z9_expected, z9.VnD());
927 ASSERT_EQUAL_SVE(z12_expected, z12.VnD());
928 ASSERT_EQUAL_SVE(z13_expected, z13.VnD());
929 ASSERT_EQUAL_SVE(z14_expected, z14.VnD());
930 ASSERT_EQUAL_SVE(z16_expected, z16.VnD());
931 ASSERT_EQUAL_SVE(z17_expected, z17.VnD());
932 ASSERT_EQUAL_SVE(z18_expected, z18.VnD());
933 ASSERT_EQUAL_SVE(z20_expected, z20.VnD());
934
935 int vl = core.GetSVELaneCount(kBRegSize) * 8;
936 switch (vl) {
937 case 128:
938 ASSERT_EQUAL_SVE(z10_expected_vl128, z10.VnD());
939 break;
940 case 384:
941 case 2048:
942 ASSERT_EQUAL_SVE(z10_expected_vl_long, z10.VnD());
943 break;
944 default:
945 printf("WARNING: Some tests skipped due to unexpected VL.\n");
946 break;
947 }
948 }
949}
950
951TEST_SVE(sve_compact) {
952 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
953 START();
954
955 __ Ptrue(p0.VnB());
956 __ Pfalse(p1.VnB());
957 __ Zip1(p2.VnS(), p0.VnS(), p1.VnS());
958 __ Zip1(p3.VnS(), p1.VnS(), p0.VnS());
959 __ Zip1(p4.VnD(), p0.VnD(), p1.VnD());
960
961 __ Index(z0.VnS(), 0x11111111, 0x11111111);
962 __ Mov(q0, q0);
963 __ Compact(z1.VnS(), p0, z0.VnS());
964 __ Compact(z2.VnS(), p2, z0.VnS());
965 __ Compact(z0.VnS(), p3, z0.VnS());
966
967 __ Index(z3.VnD(), 0x1111111111111111, 0x1111111111111111);
968 __ Mov(q3, q3);
969 __ Compact(z4.VnD(), p0, z3.VnD());
970 __ Compact(z5.VnD(), p1, z3.VnD());
971 __ Compact(z6.VnD(), p4, z3.VnD());
972
973 END();
974
975 if (CAN_RUN()) {
976 RUN();
977 uint64_t z1_expected[] = {0x4444444433333333, 0x2222222211111111};
978 uint64_t z2_expected[] = {0x0000000000000000, 0x3333333311111111};
979 uint64_t z0_expected[] = {0x0000000000000000, 0x4444444422222222};
980 uint64_t z4_expected[] = {0x2222222222222222, 0x1111111111111111};
981 uint64_t z5_expected[] = {0x0000000000000000, 0x0000000000000000};
982 uint64_t z6_expected[] = {0x0000000000000000, 0x1111111111111111};
983 ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
984 ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
985 ASSERT_EQUAL_SVE(z0_expected, z0.VnD());
986 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
987 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
988 ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
989 }
990}
991
992TEST_SVE(sve_splice) {
993 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
994 START();
995
996 __ Ptrue(p0.VnB());
997 __ Pfalse(p1.VnB());
998 int p2b_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
999 int p3b_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0};
1000 int p4b_inputs[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1001 int p5b_inputs[] = {0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0};
1002 int p6b_inputs[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0};
1003 Initialise(&masm, p2.VnB(), p2b_inputs);
1004 Initialise(&masm, p3.VnB(), p3b_inputs);
1005 Initialise(&masm, p4.VnB(), p4b_inputs);
1006 Initialise(&masm, p5.VnB(), p5b_inputs);
1007 Initialise(&masm, p6.VnB(), p6b_inputs);
1008
1009 __ Index(z30.VnB(), 1, 1);
1010
1011 __ Index(z0.VnB(), -1, -1);
1012 __ Splice(z0.VnB(), p0, z0.VnB(), z30.VnB());
1013 __ Index(z1.VnB(), -1, -1);
1014 __ Splice(z1.VnB(), p1, z1.VnB(), z30.VnB());
1015 __ Index(z2.VnB(), -1, -1);
1016 __ Splice(z2.VnB(), p2, z2.VnB(), z30.VnB());
1017 __ Index(z3.VnB(), -1, -1);
1018 __ Splice(z3.VnB(), p3, z3.VnB(), z30.VnB());
1019 __ Index(z4.VnB(), -1, -1);
1020 __ Splice(z4.VnB(), p4, z4.VnB(), z30.VnB());
1021 __ Index(z5.VnB(), -1, -1);
1022 __ Splice(z5.VnB(), p5, z5.VnB(), z30.VnB());
1023 __ Index(z6.VnB(), -1, -1);
1024 __ Splice(z6.VnB(), p6, z6.VnB(), z30.VnB());
1025
1026 int p2h_inputs[] = {0, 0, 0, 0, 0, 0, 1, 0};
1027 int p3h_inputs[] = {0, 0, 1, 0, 0, 0, 1, 0};
1028 Initialise(&masm, p2.VnH(), p2h_inputs);
1029 Initialise(&masm, p3.VnH(), p3h_inputs);
1030
1031 __ Index(z30.VnH(), 1, 1);
1032 __ Index(z29.VnH(), -1, -1);
1033 __ Splice(z7.VnH(), p2, z29.VnH(), z30.VnH());
1034 __ Splice(z8.VnH(), p3, z29.VnH(), z30.VnH());
1035
1036 int p2s_inputs[] = {0, 0, 1, 0};
1037 int p3s_inputs[] = {1, 0, 1, 0};
1038 Initialise(&masm, p2.VnS(), p2s_inputs);
1039 Initialise(&masm, p3.VnS(), p3s_inputs);
1040
1041 __ Index(z30.VnS(), 1, 1);
1042 __ Index(z29.VnS(), -1, -1);
1043 __ Splice(z9.VnS(), p2, z29.VnS(), z30.VnS());
1044 __ Splice(z10.VnS(), p3, z29.VnS(), z30.VnS());
1045
1046 int p2d_inputs[] = {0, 1};
1047 int p3d_inputs[] = {1, 0};
1048 Initialise(&masm, p2.VnD(), p2d_inputs);
1049 Initialise(&masm, p3.VnD(), p3d_inputs);
1050
1051 __ Index(z30.VnD(), 1, 1);
1052 __ Index(z29.VnD(), -1, -1);
1053 __ Splice(z11.VnD(), p2, z29.VnD(), z30.VnD());
1054 __ Splice(z30.VnD(), p3, z29.VnD(), z30.VnD());
1055
1056 END();
1057
1058 if (CAN_RUN()) {
1059 RUN();
1060 uint64_t z0_expected[] = {0xf0f1f2f3f4f5f6f7, 0xf8f9fafbfcfdfeff};
1061 uint64_t z1_expected[] = {0x100f0e0d0c0b0a09, 0x0807060504030201};
1062 uint64_t z2_expected[] = {0x0f0e0d0c0b0a0908, 0x07060504030201ff};
1063 uint64_t z3_expected[] = {0x0f0e0d0c0b0a0908, 0x07060504030201fe};
1064 uint64_t z4_expected[] = {0x0f0e0d0c0b0a0908, 0x07060504030201f0};
1065 uint64_t z5_expected[] = {0x0c0b0a0908070605, 0x04030201f6f7f8f9};
1066 uint64_t z6_expected[] = {0x01f0f1f2f3f4f5f6, 0xf7f8f9fafbfcfdfe};
1067 uint64_t z7_expected[] = {0x0007000600050004, 0x000300020001fffe};
1068 uint64_t z8_expected[] = {0x000300020001fffa, 0xfffbfffcfffdfffe};
1069 uint64_t z9_expected[] = {0x0000000300000002, 0x00000001fffffffe};
1070 uint64_t z10_expected[] = {0x00000001fffffffc, 0xfffffffdfffffffe};
1071 uint64_t z11_expected[] = {0x0000000000000001, 0xffffffffffffffff};
1072 uint64_t z30_expected[] = {0x0000000000000001, 0xfffffffffffffffe};
1073
1074 ASSERT_EQUAL_SVE(z0_expected, z0.VnD());
1075 ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
1076 ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
1077 ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
1078 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
1079 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
1080 ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
1081 ASSERT_EQUAL_SVE(z7_expected, z7.VnD());
1082 ASSERT_EQUAL_SVE(z8_expected, z8.VnD());
1083 ASSERT_EQUAL_SVE(z9_expected, z9.VnD());
1084 ASSERT_EQUAL_SVE(z10_expected, z10.VnD());
1085 ASSERT_EQUAL_SVE(z11_expected, z11.VnD());
1086 ASSERT_EQUAL_SVE(z30_expected, z30.VnD());
1087 }
1088}
1089
Jacob Bramleye8289202019-07-31 11:25:23 +01001090TEST_SVE(sve_predicate_logical) {
1091 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
TatWai Chongf4fa8222019-06-17 12:08:14 -07001092 START();
1093
1094 // 0b...01011010'10110111
1095 int p10_inputs[] = {0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1}; // Pm
1096 // 0b...11011001'01010010
1097 int p11_inputs[] = {1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0}; // Pn
1098 // 0b...01010101'10110010
1099 int p12_inputs[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0}; // pg
1100
1101 Initialise(&masm, p10.VnB(), p10_inputs);
1102 Initialise(&masm, p11.VnB(), p11_inputs);
1103 Initialise(&masm, p12.VnB(), p12_inputs);
1104
1105 __ Ands(p0.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
1106 __ Mrs(x0, NZCV);
1107 __ Bics(p1.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
1108 __ Mrs(x1, NZCV);
1109 __ Eor(p2.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
1110 __ Nand(p3.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
1111 __ Nor(p4.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
1112 __ Orn(p5.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
1113 __ Orr(p6.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
1114 __ Sel(p7.VnB(), p12, p11.VnB(), p10.VnB());
1115
1116 END();
1117
1118 if (CAN_RUN()) {
1119 RUN();
1120
1121 // 0b...01010000'00010010
1122 int p0_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0};
1123 // 0b...00000001'00000000
1124 int p1_expected[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0};
1125 // 0b...00000001'10100000
1126 int p2_expected[] = {0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0};
1127 // 0b...00000101'10100000
1128 int p3_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0};
1129 // 0b...00000100'00000000
1130 int p4_expected[] = {0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1131 // 0b...01010101'00010010
1132 int p5_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0};
1133 // 0b...01010001'10110010
1134 int p6_expected[] = {0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0};
1135 // 0b...01011011'00010111
1136 int p7_expected[] = {0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1};
1137
1138 ASSERT_EQUAL_SVE(p0_expected, p0.VnB());
1139 ASSERT_EQUAL_SVE(p1_expected, p1.VnB());
1140 ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
1141 ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
1142 ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
1143 ASSERT_EQUAL_SVE(p5_expected, p5.VnB());
1144 ASSERT_EQUAL_SVE(p6_expected, p6.VnB());
1145 ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
1146
TatWai Chong96713fe2019-06-04 16:39:37 -07001147 ASSERT_EQUAL_32(SVEFirstFlag, w0);
1148 ASSERT_EQUAL_32(SVENotLastFlag, w1);
1149 }
1150}
TatWai Chongf4fa8222019-06-17 12:08:14 -07001151
Jacob Bramleye8289202019-07-31 11:25:23 +01001152TEST_SVE(sve_int_compare_vectors) {
1153 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
TatWai Chong96713fe2019-06-04 16:39:37 -07001154 START();
1155
1156 int z10_inputs[] = {0x00, 0x80, 0xff, 0x7f, 0x00, 0x00, 0x00, 0xff};
1157 int z11_inputs[] = {0x00, 0x00, 0x00, 0x00, 0x80, 0xff, 0x7f, 0xfe};
1158 int p0_inputs[] = {1, 0, 1, 1, 1, 1, 1, 1};
1159 InsrHelper(&masm, z10.VnB(), z10_inputs);
1160 InsrHelper(&masm, z11.VnB(), z11_inputs);
1161 Initialise(&masm, p0.VnB(), p0_inputs);
1162
1163 __ Cmphs(p6.VnB(), p0.Zeroing(), z10.VnB(), z11.VnB());
1164 __ Mrs(x6, NZCV);
1165
1166 uint64_t z12_inputs[] = {0xffffffffffffffff, 0x8000000000000000};
1167 uint64_t z13_inputs[] = {0x0000000000000000, 0x8000000000000000};
1168 int p1_inputs[] = {1, 1};
1169 InsrHelper(&masm, z12.VnD(), z12_inputs);
1170 InsrHelper(&masm, z13.VnD(), z13_inputs);
1171 Initialise(&masm, p1.VnD(), p1_inputs);
1172
1173 __ Cmphi(p7.VnD(), p1.Zeroing(), z12.VnD(), z13.VnD());
1174 __ Mrs(x7, NZCV);
1175
1176 int z14_inputs[] = {0, 32767, -1, -32767, 0, 0, 0, 32766};
1177 int z15_inputs[] = {0, 0, 0, 0, 32767, -1, -32767, 32767};
1178
1179 int p2_inputs[] = {1, 0, 1, 1, 1, 1, 1, 1};
1180 InsrHelper(&masm, z14.VnH(), z14_inputs);
1181 InsrHelper(&masm, z15.VnH(), z15_inputs);
1182 Initialise(&masm, p2.VnH(), p2_inputs);
1183
1184 __ Cmpge(p8.VnH(), p2.Zeroing(), z14.VnH(), z15.VnH());
1185 __ Mrs(x8, NZCV);
1186
1187 __ Cmpeq(p9.VnH(), p2.Zeroing(), z14.VnH(), z15.VnH());
1188 __ Mrs(x9, NZCV);
1189
1190 int z16_inputs[] = {0, -1, 0, 0};
1191 int z17_inputs[] = {0, 0, 2147483647, -2147483648};
1192 int p3_inputs[] = {1, 1, 1, 1};
1193 InsrHelper(&masm, z16.VnS(), z16_inputs);
1194 InsrHelper(&masm, z17.VnS(), z17_inputs);
1195 Initialise(&masm, p3.VnS(), p3_inputs);
1196
1197 __ Cmpgt(p10.VnS(), p3.Zeroing(), z16.VnS(), z17.VnS());
1198 __ Mrs(x10, NZCV);
1199
1200 __ Cmpne(p11.VnS(), p3.Zeroing(), z16.VnS(), z17.VnS());
1201 __ Mrs(x11, NZCV);
1202
1203 // Architectural aliases testing.
1204 __ Cmpls(p12.VnB(), p0.Zeroing(), z11.VnB(), z10.VnB()); // HS
1205 __ Cmplo(p13.VnD(), p1.Zeroing(), z13.VnD(), z12.VnD()); // HI
1206 __ Cmple(p14.VnH(), p2.Zeroing(), z15.VnH(), z14.VnH()); // GE
1207 __ Cmplt(p15.VnS(), p3.Zeroing(), z17.VnS(), z16.VnS()); // GT
1208
1209 END();
1210
1211 if (CAN_RUN()) {
1212 RUN();
1213
1214 int p6_expected[] = {1, 0, 1, 1, 0, 0, 0, 1};
1215 for (size_t i = 0; i < ArrayLength(p6_expected); i++) {
1216 int lane = static_cast<int>(ArrayLength(p6_expected) - i - 1);
1217 ASSERT_EQUAL_SVE_LANE(p6_expected[i], p6.VnB(), lane);
1218 }
1219
1220 int p7_expected[] = {1, 0};
1221 ASSERT_EQUAL_SVE(p7_expected, p7.VnD());
1222
1223 int p8_expected[] = {1, 0, 0, 0, 0, 1, 1, 0};
1224 ASSERT_EQUAL_SVE(p8_expected, p8.VnH());
1225
1226 int p9_expected[] = {1, 0, 0, 0, 0, 0, 0, 0};
1227 ASSERT_EQUAL_SVE(p9_expected, p9.VnH());
1228
1229 int p10_expected[] = {0, 0, 0, 1};
1230 ASSERT_EQUAL_SVE(p10_expected, p10.VnS());
1231
1232 int p11_expected[] = {0, 1, 1, 1};
1233 ASSERT_EQUAL_SVE(p11_expected, p11.VnS());
1234
1235 // Reuse the expected results to verify the architectural aliases.
1236 ASSERT_EQUAL_SVE(p6_expected, p12.VnB());
1237 ASSERT_EQUAL_SVE(p7_expected, p13.VnD());
1238 ASSERT_EQUAL_SVE(p8_expected, p14.VnH());
1239 ASSERT_EQUAL_SVE(p10_expected, p15.VnS());
1240
1241 ASSERT_EQUAL_32(SVEFirstFlag, w6);
1242 ASSERT_EQUAL_32(NoFlag, w7);
1243 ASSERT_EQUAL_32(NoFlag, w8);
1244 ASSERT_EQUAL_32(NoFlag, w9);
1245 ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w10);
1246 }
1247}
1248
Jacob Bramleye8289202019-07-31 11:25:23 +01001249TEST_SVE(sve_int_compare_vectors_wide_elements) {
1250 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
TatWai Chong96713fe2019-06-04 16:39:37 -07001251 START();
1252
1253 int src1_inputs_1[] = {0, 1, -1, -128, 127, 100, -66};
1254 int src2_inputs_1[] = {0, -1};
1255 int mask_inputs_1[] = {1, 1, 1, 1, 1, 0, 1};
1256 InsrHelper(&masm, z13.VnB(), src1_inputs_1);
1257 InsrHelper(&masm, z19.VnD(), src2_inputs_1);
1258 Initialise(&masm, p0.VnB(), mask_inputs_1);
1259
1260 __ Cmpge(p2.VnB(), p0.Zeroing(), z13.VnB(), z19.VnD());
1261 __ Mrs(x2, NZCV);
1262 __ Cmpgt(p3.VnB(), p0.Zeroing(), z13.VnB(), z19.VnD());
1263 __ Mrs(x3, NZCV);
1264
1265 int src1_inputs_2[] = {0, 32767, -1, -32767, 1, 1234, 0, 32766};
1266 int src2_inputs_2[] = {0, -32767};
1267 int mask_inputs_2[] = {1, 0, 1, 1, 1, 1, 1, 1};
1268 InsrHelper(&masm, z13.VnH(), src1_inputs_2);
1269 InsrHelper(&masm, z19.VnD(), src2_inputs_2);
1270 Initialise(&masm, p0.VnH(), mask_inputs_2);
1271
1272 __ Cmple(p4.VnH(), p0.Zeroing(), z13.VnH(), z19.VnD());
1273 __ Mrs(x4, NZCV);
1274 __ Cmplt(p5.VnH(), p0.Zeroing(), z13.VnH(), z19.VnD());
1275 __ Mrs(x5, NZCV);
1276
1277 int src1_inputs_3[] = {0, -1, 2147483647, -2147483648};
1278 int src2_inputs_3[] = {0, -2147483648};
1279 int mask_inputs_3[] = {1, 1, 1, 1};
1280 InsrHelper(&masm, z13.VnS(), src1_inputs_3);
1281 InsrHelper(&masm, z19.VnD(), src2_inputs_3);
1282 Initialise(&masm, p0.VnS(), mask_inputs_3);
1283
1284 __ Cmpeq(p6.VnS(), p0.Zeroing(), z13.VnS(), z19.VnD());
1285 __ Mrs(x6, NZCV);
1286 __ Cmpne(p7.VnS(), p0.Zeroing(), z13.VnS(), z19.VnD());
1287 __ Mrs(x7, NZCV);
1288
1289 int src1_inputs_4[] = {0x00, 0x80, 0x7f, 0xff, 0x7f, 0xf0, 0x0f, 0x55};
1290 int src2_inputs_4[] = {0x00, 0x7f};
1291 int mask_inputs_4[] = {1, 1, 1, 1, 0, 1, 1, 1};
1292 InsrHelper(&masm, z13.VnB(), src1_inputs_4);
1293 InsrHelper(&masm, z19.VnD(), src2_inputs_4);
1294 Initialise(&masm, p0.VnB(), mask_inputs_4);
1295
1296 __ Cmplo(p8.VnB(), p0.Zeroing(), z13.VnB(), z19.VnD());
1297 __ Mrs(x8, NZCV);
1298 __ Cmpls(p9.VnB(), p0.Zeroing(), z13.VnB(), z19.VnD());
1299 __ Mrs(x9, NZCV);
1300
1301 int src1_inputs_5[] = {0x0000, 0x8000, 0x7fff, 0xffff};
1302 int src2_inputs_5[] = {0x8000, 0xffff};
1303 int mask_inputs_5[] = {1, 1, 1, 1};
1304 InsrHelper(&masm, z13.VnS(), src1_inputs_5);
1305 InsrHelper(&masm, z19.VnD(), src2_inputs_5);
1306 Initialise(&masm, p0.VnS(), mask_inputs_5);
1307
1308 __ Cmphi(p10.VnS(), p0.Zeroing(), z13.VnS(), z19.VnD());
1309 __ Mrs(x10, NZCV);
1310 __ Cmphs(p11.VnS(), p0.Zeroing(), z13.VnS(), z19.VnD());
1311 __ Mrs(x11, NZCV);
1312
1313 END();
1314
1315 if (CAN_RUN()) {
1316 RUN();
1317 int p2_expected[] = {1, 1, 1, 0, 1, 0, 0};
1318 ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
1319
1320 int p3_expected[] = {1, 1, 0, 0, 1, 0, 0};
1321 ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
1322
1323 int p4_expected[] = {0x1, 0x0, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0};
1324 ASSERT_EQUAL_SVE(p4_expected, p4.VnH());
1325
1326 int p5_expected[] = {0x0, 0x0, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0};
1327 ASSERT_EQUAL_SVE(p5_expected, p5.VnH());
1328
1329 int p6_expected[] = {0x1, 0x0, 0x0, 0x1};
1330 ASSERT_EQUAL_SVE(p6_expected, p6.VnS());
1331
1332 int p7_expected[] = {0x0, 0x1, 0x1, 0x0};
1333 ASSERT_EQUAL_SVE(p7_expected, p7.VnS());
1334
1335 int p8_expected[] = {1, 0, 0, 0, 0, 0, 1, 1};
1336 ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
1337
1338 int p9_expected[] = {1, 0, 1, 0, 0, 0, 1, 1};
1339 ASSERT_EQUAL_SVE(p9_expected, p9.VnB());
1340
1341 int p10_expected[] = {0x0, 0x0, 0x0, 0x0};
1342 ASSERT_EQUAL_SVE(p10_expected, p10.VnS());
1343
1344 int p11_expected[] = {0x0, 0x1, 0x0, 0x1};
1345 ASSERT_EQUAL_SVE(p11_expected, p11.VnS());
1346
1347 ASSERT_EQUAL_32(NoFlag, w2);
1348 ASSERT_EQUAL_32(NoFlag, w3);
1349 ASSERT_EQUAL_32(NoFlag, w4);
1350 ASSERT_EQUAL_32(SVENotLastFlag, w5);
1351 ASSERT_EQUAL_32(SVEFirstFlag, w6);
1352 ASSERT_EQUAL_32(SVENotLastFlag, w7);
1353 ASSERT_EQUAL_32(SVEFirstFlag, w8);
1354 ASSERT_EQUAL_32(SVEFirstFlag, w9);
1355 ASSERT_EQUAL_32(SVENotLastFlag | SVENoneFlag, w10);
1356 ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w11);
TatWai Chongf4fa8222019-06-17 12:08:14 -07001357 }
TatWai Chongf4fa8222019-06-17 12:08:14 -07001358}
1359
Jacob Bramleye8289202019-07-31 11:25:23 +01001360TEST_SVE(sve_bitwise_imm) {
1361 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
TatWai Chonga1885a52019-04-15 17:19:14 -07001362 START();
1363
1364 // clang-format off
1365 uint64_t z21_inputs[] = {0xfedcba9876543210, 0x0123456789abcdef};
1366 uint32_t z22_inputs[] = {0xfedcba98, 0x76543210, 0x01234567, 0x89abcdef};
1367 uint16_t z23_inputs[] = {0xfedc, 0xba98, 0x7654, 0x3210,
1368 0x0123, 0x4567, 0x89ab, 0xcdef};
1369 uint8_t z24_inputs[] = {0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10,
1370 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef};
1371 // clang-format on
1372
1373 InsrHelper(&masm, z1.VnD(), z21_inputs);
1374 InsrHelper(&masm, z2.VnS(), z22_inputs);
1375 InsrHelper(&masm, z3.VnH(), z23_inputs);
1376 InsrHelper(&masm, z4.VnB(), z24_inputs);
1377
1378 __ And(z1.VnD(), z1.VnD(), 0x0000ffff0000ffff);
1379 __ And(z2.VnS(), z2.VnS(), 0xff0000ff);
1380 __ And(z3.VnH(), z3.VnH(), 0x0ff0);
1381 __ And(z4.VnB(), z4.VnB(), 0x3f);
1382
1383 InsrHelper(&masm, z5.VnD(), z21_inputs);
1384 InsrHelper(&masm, z6.VnS(), z22_inputs);
1385 InsrHelper(&masm, z7.VnH(), z23_inputs);
1386 InsrHelper(&masm, z8.VnB(), z24_inputs);
1387
1388 __ Eor(z5.VnD(), z5.VnD(), 0x0000ffff0000ffff);
1389 __ Eor(z6.VnS(), z6.VnS(), 0xff0000ff);
1390 __ Eor(z7.VnH(), z7.VnH(), 0x0ff0);
1391 __ Eor(z8.VnB(), z8.VnB(), 0x3f);
1392
1393 InsrHelper(&masm, z9.VnD(), z21_inputs);
1394 InsrHelper(&masm, z10.VnS(), z22_inputs);
1395 InsrHelper(&masm, z11.VnH(), z23_inputs);
1396 InsrHelper(&masm, z12.VnB(), z24_inputs);
1397
1398 __ Orr(z9.VnD(), z9.VnD(), 0x0000ffff0000ffff);
1399 __ Orr(z10.VnS(), z10.VnS(), 0xff0000ff);
1400 __ Orr(z11.VnH(), z11.VnH(), 0x0ff0);
1401 __ Orr(z12.VnB(), z12.VnB(), 0x3f);
1402
Jacob Bramley6069fd42019-06-24 10:20:45 +01001403 {
1404 // The `Dup` macro maps onto either `dup` or `dupm`, but has its own test,
1405 // so here we test `dupm` directly.
1406 ExactAssemblyScope guard(&masm, 4 * kInstructionSize);
1407 __ dupm(z13.VnD(), 0x7ffffff800000000);
1408 __ dupm(z14.VnS(), 0x7ffc7ffc);
1409 __ dupm(z15.VnH(), 0x3ffc);
1410 __ dupm(z16.VnB(), 0xc3);
1411 }
TatWai Chonga1885a52019-04-15 17:19:14 -07001412
1413 END();
1414
1415 if (CAN_RUN()) {
1416 RUN();
1417
1418 // clang-format off
1419 uint64_t z1_expected[] = {0x0000ba9800003210, 0x000045670000cdef};
1420 uint32_t z2_expected[] = {0xfe000098, 0x76000010, 0x01000067, 0x890000ef};
1421 uint16_t z3_expected[] = {0x0ed0, 0x0a90, 0x0650, 0x0210,
1422 0x0120, 0x0560, 0x09a0, 0x0de0};
1423 uint8_t z4_expected[] = {0x3e, 0x1c, 0x3a, 0x18, 0x36, 0x14, 0x32, 0x10,
1424 0x01, 0x23, 0x05, 0x27, 0x09, 0x2b, 0x0d, 0x2f};
1425
1426 ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
1427 ASSERT_EQUAL_SVE(z2_expected, z2.VnS());
1428 ASSERT_EQUAL_SVE(z3_expected, z3.VnH());
1429 ASSERT_EQUAL_SVE(z4_expected, z4.VnB());
1430
1431 uint64_t z5_expected[] = {0xfedc45677654cdef, 0x0123ba9889ab3210};
1432 uint32_t z6_expected[] = {0x01dcba67, 0x895432ef, 0xfe234598, 0x76abcd10};
1433 uint16_t z7_expected[] = {0xf12c, 0xb568, 0x79a4, 0x3de0,
1434 0x0ed3, 0x4a97, 0x865b, 0xc21f};
1435 uint8_t z8_expected[] = {0xc1, 0xe3, 0x85, 0xa7, 0x49, 0x6b, 0x0d, 0x2f,
1436 0x3e, 0x1c, 0x7a, 0x58, 0xb6, 0x94, 0xf2, 0xd0};
1437
1438 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
1439 ASSERT_EQUAL_SVE(z6_expected, z6.VnS());
1440 ASSERT_EQUAL_SVE(z7_expected, z7.VnH());
1441 ASSERT_EQUAL_SVE(z8_expected, z8.VnB());
1442
1443 uint64_t z9_expected[] = {0xfedcffff7654ffff, 0x0123ffff89abffff};
1444 uint32_t z10_expected[] = {0xffdcbaff, 0xff5432ff, 0xff2345ff, 0xffabcdff};
1445 uint16_t z11_expected[] = {0xfffc, 0xbff8, 0x7ff4, 0x3ff0,
1446 0x0ff3, 0x4ff7, 0x8ffb, 0xcfff};
1447 uint8_t z12_expected[] = {0xff, 0xff, 0xbf, 0xbf, 0x7f, 0x7f, 0x3f, 0x3f,
1448 0x3f, 0x3f, 0x7f, 0x7f, 0xbf, 0xbf, 0xff, 0xff};
1449
1450 ASSERT_EQUAL_SVE(z9_expected, z9.VnD());
1451 ASSERT_EQUAL_SVE(z10_expected, z10.VnS());
1452 ASSERT_EQUAL_SVE(z11_expected, z11.VnH());
1453 ASSERT_EQUAL_SVE(z12_expected, z12.VnB());
1454
1455 uint64_t z13_expected[] = {0x7ffffff800000000, 0x7ffffff800000000};
1456 uint32_t z14_expected[] = {0x7ffc7ffc, 0x7ffc7ffc, 0x7ffc7ffc, 0x7ffc7ffc};
1457 uint16_t z15_expected[] = {0x3ffc, 0x3ffc, 0x3ffc, 0x3ffc,
1458 0x3ffc, 0x3ffc, 0x3ffc, 0x3ffc};
1459 ASSERT_EQUAL_SVE(z13_expected, z13.VnD());
1460 ASSERT_EQUAL_SVE(z14_expected, z14.VnS());
1461 ASSERT_EQUAL_SVE(z15_expected, z15.VnH());
1462 // clang-format on
1463 }
1464}
1465
1466TEST_SVE(sve_dup_imm) {
1467 // The `Dup` macro can generate `dup`, `dupm`, and it can synthesise
1468 // unencodable immediates.
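  // (A rough sketch of the expected lowering, not checked by this test: try
  // the signed 8-bit `dup` immediate, optionally shifted left by 8, then a
  // `dupm` bitmask immediate, and otherwise materialise the value in a
  // scratch register and broadcast it. Only the results are asserted below.)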
1469
1470 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
1471 START();
1472
1473 // Encodable with `dup` (shift 0).
1474 __ Dup(z0.VnD(), -1);
1475 __ Dup(z1.VnS(), 0x7f);
1476 __ Dup(z2.VnH(), -0x80);
1477 __ Dup(z3.VnB(), 42);
1478
1479 // Encodable with `dup` (shift 8).
1480 __ Dup(z4.VnD(), -42 * 256);
1481 __ Dup(z5.VnS(), -0x8000);
1482 __ Dup(z6.VnH(), 0x7f00);
1483 // B-sized lanes cannot take a shift of 8.
1484
1485 // Encodable with `dupm` (but not `dup`).
1486 __ Dup(z10.VnD(), 0x3fc);
1487 __ Dup(z11.VnS(), -516097); // 0xfff81fff, as a signed int.
1488 __ Dup(z12.VnH(), 0x0001);
1489 // All values that fit B-sized lanes are encodable with `dup`.
1490
1491 // Cases that require immediate synthesis.
1492 __ Dup(z20.VnD(), 0x1234);
1493 __ Dup(z21.VnD(), -4242);
1494 __ Dup(z22.VnD(), 0xfedcba9876543210);
1495 __ Dup(z23.VnS(), 0x01020304);
1496 __ Dup(z24.VnS(), -0x01020304);
1497 __ Dup(z25.VnH(), 0x3c38);
1498 // All values that fit B-sized lanes are directly encodable.
1499
1500 END();
1501
1502 if (CAN_RUN()) {
1503 RUN();
1504
1505 ASSERT_EQUAL_SVE(0xffffffffffffffff, z0.VnD());
1506 ASSERT_EQUAL_SVE(0x0000007f, z1.VnS());
1507 ASSERT_EQUAL_SVE(0xff80, z2.VnH());
1508 ASSERT_EQUAL_SVE(0x2a, z3.VnB());
1509
1510 ASSERT_EQUAL_SVE(0xffffffffffffd600, z4.VnD());
1511 ASSERT_EQUAL_SVE(0xffff8000, z5.VnS());
1512 ASSERT_EQUAL_SVE(0x7f00, z6.VnH());
1513
1514 ASSERT_EQUAL_SVE(0x00000000000003fc, z10.VnD());
1515 ASSERT_EQUAL_SVE(0xfff81fff, z11.VnS());
1516 ASSERT_EQUAL_SVE(0x0001, z12.VnH());
1517
1518 ASSERT_EQUAL_SVE(0x1234, z20.VnD());
1519 ASSERT_EQUAL_SVE(0xffffffffffffef6e, z21.VnD());
1520 ASSERT_EQUAL_SVE(0xfedcba9876543210, z22.VnD());
1521 ASSERT_EQUAL_SVE(0x01020304, z23.VnS());
1522 ASSERT_EQUAL_SVE(0xfefdfcfc, z24.VnS());
1523 ASSERT_EQUAL_SVE(0x3c38, z25.VnH());
1524 }
1525}
1526
1527TEST_SVE(sve_inc_dec_p_scalar) {
1528 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
1529 START();
1530
1531 int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
1532 Initialise(&masm, p0.VnB(), p0_inputs);
1533
1534 int p0_b_count = 9;
1535 int p0_h_count = 5;
1536 int p0_s_count = 3;
1537 int p0_d_count = 2;
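  // (These counts follow from p0_inputs: every flag counts for B lanes, while
  // for H, S and D lanes only every 2nd, 4th and 8th flag is significant,
  // counting from the rightmost array element, which maps to lane 0.)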
1538
1539 // 64-bit operations preserve their high bits.
1540 __ Mov(x0, 0x123456780000002a);
1541 __ Decp(x0, p0.VnB());
1542
1543 __ Mov(x1, 0x123456780000002a);
1544 __ Incp(x1, p0.VnH());
1545
1546 // Check that saturation does not occur.
1547 __ Mov(x10, 1);
1548 __ Decp(x10, p0.VnS());
1549
1550 __ Mov(x11, UINT64_MAX);
1551 __ Incp(x11, p0.VnD());
1552
1553 __ Mov(x12, INT64_MAX);
1554 __ Incp(x12, p0.VnB());
1555
1556 // With an all-true predicate, these instructions increment or decrement by
1557 // the vector length.
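  // (For example, with a 256-bit vector there are 32 B lanes and 16 H lanes,
  // so x20 and x21 below would move by 32 and 16 respectively.)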
1558 __ Ptrue(p15.VnB());
1559
1560 __ Mov(x20, 0x4000000000000000);
1561 __ Decp(x20, p15.VnB());
1562
1563 __ Mov(x21, 0x4000000000000000);
1564 __ Incp(x21, p15.VnH());
1565
1566 END();
1567 if (CAN_RUN()) {
1568 RUN();
1569
1570 ASSERT_EQUAL_64(0x123456780000002a - p0_b_count, x0);
1571 ASSERT_EQUAL_64(0x123456780000002a + p0_h_count, x1);
1572
1573 ASSERT_EQUAL_64(UINT64_C(1) - p0_s_count, x10);
1574 ASSERT_EQUAL_64(UINT64_MAX + p0_d_count, x11);
1575 ASSERT_EQUAL_64(static_cast<uint64_t>(INT64_MAX) + p0_b_count, x12);
1576
1577 ASSERT_EQUAL_64(0x4000000000000000 - core.GetSVELaneCount(kBRegSize), x20);
1578 ASSERT_EQUAL_64(0x4000000000000000 + core.GetSVELaneCount(kHRegSize), x21);
1579 }
1580}
1581
1582TEST_SVE(sve_sqinc_sqdec_p_scalar) {
1583 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
1584 START();
1585
1586 int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
1587 Initialise(&masm, p0.VnB(), p0_inputs);
1588
1589 int p0_b_count = 9;
1590 int p0_h_count = 5;
1591 int p0_s_count = 3;
1592 int p0_d_count = 2;
1593
1594 uint64_t dummy_high = 0x1234567800000000;
1595
1596 // 64-bit operations preserve their high bits.
1597 __ Mov(x0, dummy_high + 42);
1598 __ Sqdecp(x0, p0.VnB());
1599
1600 __ Mov(x1, dummy_high + 42);
1601 __ Sqincp(x1, p0.VnH());
1602
1603 // 32-bit operations sign-extend into their high bits.
1604 __ Mov(x2, dummy_high + 42);
1605 __ Sqdecp(x2, p0.VnS(), w2);
1606
1607 __ Mov(x3, dummy_high + 42);
1608 __ Sqincp(x3, p0.VnD(), w3);
1609
1610 __ Mov(x4, dummy_high + 1);
1611 __ Sqdecp(x4, p0.VnS(), w4);
1612
1613 __ Mov(x5, dummy_high - 1);
1614 __ Sqincp(x5, p0.VnD(), w5);
1615
1616 // Check that saturation behaves correctly.
1617 __ Mov(x10, 0x8000000000000001); // INT64_MIN + 1
1618 __ Sqdecp(x10, p0.VnB());
1619
1620 __ Mov(x11, dummy_high + 0x80000001); // INT32_MIN + 1
1621 __ Sqdecp(x11, p0.VnH(), w11);
1622
1623 __ Mov(x12, 1);
1624 __ Sqdecp(x12, p0.VnS());
1625
1626 __ Mov(x13, dummy_high + 1);
1627 __ Sqdecp(x13, p0.VnD(), w13);
1628
1629 __ Mov(x14, 0x7ffffffffffffffe); // INT64_MAX - 1
1630 __ Sqincp(x14, p0.VnB());
1631
1632 __ Mov(x15, dummy_high + 0x7ffffffe); // INT32_MAX - 1
1633 __ Sqincp(x15, p0.VnH(), w15);
1634
1635 // Don't use x16 and x17 since they are scratch registers by default.
1636
1637 __ Mov(x18, 0xffffffffffffffff);
1638 __ Sqincp(x18, p0.VnS());
1639
1640 __ Mov(x19, dummy_high + 0xffffffff);
1641 __ Sqincp(x19, p0.VnD(), w19);
1642
1643 __ Mov(x20, dummy_high + 0xffffffff);
1644 __ Sqdecp(x20, p0.VnB(), w20);
1645
1646 // With an all-true predicate, these instructions increment or decrement by
1647 // the vector length.
1648 __ Ptrue(p15.VnB());
1649
1650 __ Mov(x21, 0);
1651 __ Sqdecp(x21, p15.VnB());
1652
1653 __ Mov(x22, 0);
1654 __ Sqincp(x22, p15.VnH());
1655
1656 __ Mov(x23, dummy_high);
1657 __ Sqdecp(x23, p15.VnS(), w23);
1658
1659 __ Mov(x24, dummy_high);
1660 __ Sqincp(x24, p15.VnD(), w24);
1661
1662 END();
1663 if (CAN_RUN()) {
1664 RUN();
1665
1666 // 64-bit operations preserve their high bits.
1667 ASSERT_EQUAL_64(dummy_high + 42 - p0_b_count, x0);
1668 ASSERT_EQUAL_64(dummy_high + 42 + p0_h_count, x1);
1669
1670 // 32-bit operations sign-extend into their high bits.
1671 ASSERT_EQUAL_64(42 - p0_s_count, x2);
1672 ASSERT_EQUAL_64(42 + p0_d_count, x3);
1673 ASSERT_EQUAL_64(0xffffffff00000000 | (1 - p0_s_count), x4);
1674 ASSERT_EQUAL_64(p0_d_count - 1, x5);
1675
1676 // Check that saturation behaves correctly.
1677 ASSERT_EQUAL_64(INT64_MIN, x10);
1678 ASSERT_EQUAL_64(INT32_MIN, x11);
1679 ASSERT_EQUAL_64(1 - p0_s_count, x12);
1680 ASSERT_EQUAL_64(1 - p0_d_count, x13);
1681 ASSERT_EQUAL_64(INT64_MAX, x14);
1682 ASSERT_EQUAL_64(INT32_MAX, x15);
1683 ASSERT_EQUAL_64(p0_s_count - 1, x18);
1684 ASSERT_EQUAL_64(p0_d_count - 1, x19);
1685 ASSERT_EQUAL_64(-1 - p0_b_count, x20);
1686
1687 // Check all-true predicates.
1688 ASSERT_EQUAL_64(-core.GetSVELaneCount(kBRegSize), x21);
1689 ASSERT_EQUAL_64(core.GetSVELaneCount(kHRegSize), x22);
1690 ASSERT_EQUAL_64(-core.GetSVELaneCount(kSRegSize), x23);
1691 ASSERT_EQUAL_64(core.GetSVELaneCount(kDRegSize), x24);
1692 }
1693}
1694
1695TEST_SVE(sve_uqinc_uqdec_p_scalar) {
1696 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
1697 START();
1698
1699 int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
1700 Initialise(&masm, p0.VnB(), p0_inputs);
1701
1702 int p0_b_count = 9;
1703 int p0_h_count = 5;
1704 int p0_s_count = 3;
1705 int p0_d_count = 2;
1706
1707 uint64_t dummy_high = 0x1234567800000000;
1708
1709 // 64-bit operations preserve their high bits.
1710 __ Mov(x0, dummy_high + 42);
1711 __ Uqdecp(x0, p0.VnB());
1712
1713 __ Mov(x1, dummy_high + 42);
1714 __ Uqincp(x1, p0.VnH());
1715
1716 // 32-bit operations zero-extend into their high bits.
1717 __ Mov(x2, dummy_high + 42);
1718 __ Uqdecp(x2, p0.VnS(), w2);
1719
1720 __ Mov(x3, dummy_high + 42);
1721 __ Uqincp(x3, p0.VnD(), w3);
1722
1723 __ Mov(x4, dummy_high + 0x80000001);
1724 __ Uqdecp(x4, p0.VnS(), w4);
1725
1726 __ Mov(x5, dummy_high + 0x7fffffff);
1727 __ Uqincp(x5, p0.VnD(), w5);
1728
1729 // Check that saturation behaves correctly.
1730 __ Mov(x10, 1);
1731 __ Uqdecp(x10, p0.VnB(), x10);
1732
1733 __ Mov(x11, dummy_high + 1);
1734 __ Uqdecp(x11, p0.VnH(), w11);
1735
1736 __ Mov(x12, 0x8000000000000000); // INT64_MAX + 1
1737 __ Uqdecp(x12, p0.VnS(), x12);
1738
1739 __ Mov(x13, dummy_high + 0x80000000); // INT32_MAX + 1
1740 __ Uqdecp(x13, p0.VnD(), w13);
1741
1742 __ Mov(x14, 0xfffffffffffffffe); // UINT64_MAX - 1
1743 __ Uqincp(x14, p0.VnB(), x14);
1744
1745 __ Mov(x15, dummy_high + 0xfffffffe); // UINT32_MAX - 1
1746 __ Uqincp(x15, p0.VnH(), w15);
1747
1748 // Don't use x16 and x17 since they are scratch registers by default.
1749
1750 __ Mov(x18, 0x7ffffffffffffffe); // INT64_MAX - 1
1751 __ Uqincp(x18, p0.VnS(), x18);
1752
1753 __ Mov(x19, dummy_high + 0x7ffffffe); // INT32_MAX - 1
1754 __ Uqincp(x19, p0.VnD(), w19);
1755
1756 // With an all-true predicate, these instructions increment or decrement by
1757 // the vector length.
1758 __ Ptrue(p15.VnB());
1759
1760 __ Mov(x20, 0x4000000000000000);
1761 __ Uqdecp(x20, p15.VnB(), x20);
1762
1763 __ Mov(x21, 0x4000000000000000);
1764 __ Uqincp(x21, p15.VnH(), x21);
1765
1766 __ Mov(x22, dummy_high + 0x40000000);
1767 __ Uqdecp(x22, p15.VnS(), w22);
1768
1769 __ Mov(x23, dummy_high + 0x40000000);
1770 __ Uqincp(x23, p15.VnD(), w23);
1771
1772 END();
1773 if (CAN_RUN()) {
1774 RUN();
1775
1776 // 64-bit operations preserve their high bits.
1777 ASSERT_EQUAL_64(dummy_high + 42 - p0_b_count, x0);
1778 ASSERT_EQUAL_64(dummy_high + 42 + p0_h_count, x1);
1779
1780 // 32-bit operations zero-extend into their high bits.
1781 ASSERT_EQUAL_64(42 - p0_s_count, x2);
1782 ASSERT_EQUAL_64(42 + p0_d_count, x3);
1783 ASSERT_EQUAL_64(UINT64_C(0x80000001) - p0_s_count, x4);
1784 ASSERT_EQUAL_64(UINT64_C(0x7fffffff) + p0_d_count, x5);
1785
1786 // Check that saturation behaves correctly.
1787 ASSERT_EQUAL_64(0, x10);
1788 ASSERT_EQUAL_64(0, x11);
1789 ASSERT_EQUAL_64(0x8000000000000000 - p0_s_count, x12);
1790 ASSERT_EQUAL_64(UINT64_C(0x80000000) - p0_d_count, x13);
1791 ASSERT_EQUAL_64(UINT64_MAX, x14);
1792 ASSERT_EQUAL_64(UINT32_MAX, x15);
1793 ASSERT_EQUAL_64(0x7ffffffffffffffe + p0_s_count, x18);
1794 ASSERT_EQUAL_64(UINT64_C(0x7ffffffe) + p0_d_count, x19);
1795
1796 // Check all-true predicates.
1797 ASSERT_EQUAL_64(0x4000000000000000 - core.GetSVELaneCount(kBRegSize), x20);
1798 ASSERT_EQUAL_64(0x4000000000000000 + core.GetSVELaneCount(kHRegSize), x21);
1799 ASSERT_EQUAL_64(0x40000000 - core.GetSVELaneCount(kSRegSize), x22);
1800 ASSERT_EQUAL_64(0x40000000 + core.GetSVELaneCount(kDRegSize), x23);
1801 }
1802}
1803
1804TEST_SVE(sve_inc_dec_p_vector) {
1805 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
1806 START();
1807
1808 // There are {5, 3, 2} active {H, S, D} lanes. B-sized lanes are ignored.
1809 int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
1810 Initialise(&masm, p0.VnB(), p0_inputs);
1811
1812 // Check that saturation does not occur.
1813
1814 int64_t z0_inputs[] = {0x1234567800000042, 0, 1, INT64_MIN};
1815 InsrHelper(&masm, z0.VnD(), z0_inputs);
1816
1817 int64_t z1_inputs[] = {0x12345678ffffff2a, 0, -1, INT64_MAX};
1818 InsrHelper(&masm, z1.VnD(), z1_inputs);
1819
1820 int32_t z2_inputs[] = {0x12340042, 0, -1, 1, INT32_MAX, INT32_MIN};
1821 InsrHelper(&masm, z2.VnS(), z2_inputs);
1822
1823 int16_t z3_inputs[] = {0x122a, 0, 1, -1, INT16_MIN, INT16_MAX};
1824 InsrHelper(&masm, z3.VnH(), z3_inputs);
1825
1826 // The MacroAssembler implements non-destructive operations using movprfx.
1827 __ Decp(z10.VnD(), p0, z0.VnD());
1828 __ Decp(z11.VnD(), p0, z1.VnD());
1829 __ Decp(z12.VnS(), p0, z2.VnS());
1830 __ Decp(z13.VnH(), p0, z3.VnH());
1831
1832 __ Incp(z14.VnD(), p0, z0.VnD());
1833 __ Incp(z15.VnD(), p0, z1.VnD());
1834 __ Incp(z16.VnS(), p0, z2.VnS());
1835 __ Incp(z17.VnH(), p0, z3.VnH());
1836
1837 // Also test destructive forms.
1838 __ Mov(z4, z0);
1839 __ Mov(z5, z1);
1840 __ Mov(z6, z2);
1841 __ Mov(z7, z3);
1842
1843 __ Decp(z0.VnD(), p0);
1844 __ Decp(z1.VnD(), p0);
1845 __ Decp(z2.VnS(), p0);
1846 __ Decp(z3.VnH(), p0);
1847
1848 __ Incp(z4.VnD(), p0);
1849 __ Incp(z5.VnD(), p0);
1850 __ Incp(z6.VnS(), p0);
1851 __ Incp(z7.VnH(), p0);
1852
1853 END();
1854 if (CAN_RUN()) {
1855 RUN();
1856
1857 // z0_inputs[...] - number of active D lanes (2)
1858 int64_t z0_expected[] = {0x1234567800000040, -2, -1, 0x7ffffffffffffffe};
1859 ASSERT_EQUAL_SVE(z0_expected, z0.VnD());
1860
1861 // z1_inputs[...] - number of active D lanes (2)
1862 int64_t z1_expected[] = {0x12345678ffffff28, -2, -3, 0x7ffffffffffffffd};
1863 ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
1864
1865 // z2_inputs[...] - number of active S lanes (3)
1866 int32_t z2_expected[] = {0x1234003f, -3, -4, -2, 0x7ffffffc, 0x7ffffffd};
1867 ASSERT_EQUAL_SVE(z2_expected, z2.VnS());
1868
1869 // z3_inputs[...] - number of active H lanes (5)
1870 int16_t z3_expected[] = {0x1225, -5, -4, -6, 0x7ffb, 0x7ffa};
1871 ASSERT_EQUAL_SVE(z3_expected, z3.VnH());
1872
1873 // z0_inputs[...] + number of active D lanes (2)
1874 uint64_t z4_expected[] = {0x1234567800000044, 2, 3, 0x8000000000000002};
1875 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
1876
1877 // z1_inputs[...] + number of active D lanes (2)
1878 uint64_t z5_expected[] = {0x12345678ffffff2c, 2, 1, 0x8000000000000001};
1879 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
1880
1881 // z2_inputs[...] + number of active S lanes (3)
1882 uint32_t z6_expected[] = {0x12340045, 3, 2, 4, 0x80000002, 0x80000003};
1883 ASSERT_EQUAL_SVE(z6_expected, z6.VnS());
1884
1885 // z3_inputs[...] + number of active H lanes (5)
1886 uint16_t z7_expected[] = {0x122f, 5, 6, 4, 0x8005, 0x8004};
1887 ASSERT_EQUAL_SVE(z7_expected, z7.VnH());
1888
1889 // Check that the non-destructive macros produced the same results.
1890 ASSERT_EQUAL_SVE(z0_expected, z10.VnD());
1891 ASSERT_EQUAL_SVE(z1_expected, z11.VnD());
1892 ASSERT_EQUAL_SVE(z2_expected, z12.VnS());
1893 ASSERT_EQUAL_SVE(z3_expected, z13.VnH());
1894 ASSERT_EQUAL_SVE(z4_expected, z14.VnD());
1895 ASSERT_EQUAL_SVE(z5_expected, z15.VnD());
1896 ASSERT_EQUAL_SVE(z6_expected, z16.VnS());
1897 ASSERT_EQUAL_SVE(z7_expected, z17.VnH());
1898 }
1899}
1900
1901TEST_SVE(sve_inc_dec_ptrue_vector) {
1902 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
1903 START();
1904
1905 // With an all-true predicate, these instructions increment or decrement by
1906 // the vector length.
1907 __ Ptrue(p15.VnB());
1908
1909 __ Dup(z0.VnD(), 0);
1910 __ Decp(z0.VnD(), p15);
1911
1912 __ Dup(z1.VnS(), 0);
1913 __ Decp(z1.VnS(), p15);
1914
1915 __ Dup(z2.VnH(), 0);
1916 __ Decp(z2.VnH(), p15);
1917
1918 __ Dup(z3.VnD(), 0);
1919 __ Incp(z3.VnD(), p15);
1920
1921 __ Dup(z4.VnS(), 0);
1922 __ Incp(z4.VnS(), p15);
1923
1924 __ Dup(z5.VnH(), 0);
1925 __ Incp(z5.VnH(), p15);
1926
1927 END();
1928 if (CAN_RUN()) {
1929 RUN();
1930
1931 int d_lane_count = core.GetSVELaneCount(kDRegSize);
1932 int s_lane_count = core.GetSVELaneCount(kSRegSize);
1933 int h_lane_count = core.GetSVELaneCount(kHRegSize);
1934
1935 for (int i = 0; i < d_lane_count; i++) {
1936 ASSERT_EQUAL_SVE_LANE(-d_lane_count, z0.VnD(), i);
1937 ASSERT_EQUAL_SVE_LANE(d_lane_count, z3.VnD(), i);
1938 }
1939
1940 for (int i = 0; i < s_lane_count; i++) {
1941 ASSERT_EQUAL_SVE_LANE(-s_lane_count, z1.VnS(), i);
1942 ASSERT_EQUAL_SVE_LANE(s_lane_count, z4.VnS(), i);
1943 }
1944
1945 for (int i = 0; i < h_lane_count; i++) {
1946 ASSERT_EQUAL_SVE_LANE(-h_lane_count, z2.VnH(), i);
1947 ASSERT_EQUAL_SVE_LANE(h_lane_count, z5.VnH(), i);
1948 }
1949 }
1950}
1951
1952TEST_SVE(sve_sqinc_sqdec_p_vector) {
1953 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
1954 START();
1955
1956 // There are {5, 3, 2} active {H, S, D} lanes. B-sized lanes are ignored.
1957 int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
1958 Initialise(&masm, p0.VnB(), p0_inputs);
1959
1960 // Check that saturation behaves correctly.
1961
1962 int64_t z0_inputs[] = {0x1234567800000042, 0, 1, INT64_MIN};
1963 InsrHelper(&masm, z0.VnD(), z0_inputs);
1964
1965 int64_t z1_inputs[] = {0x12345678ffffff2a, 0, -1, INT64_MAX};
1966 InsrHelper(&masm, z1.VnD(), z1_inputs);
1967
1968 int32_t z2_inputs[] = {0x12340042, 0, -1, 1, INT32_MAX, INT32_MIN};
1969 InsrHelper(&masm, z2.VnS(), z2_inputs);
1970
1971 int16_t z3_inputs[] = {0x122a, 0, 1, -1, INT16_MIN, INT16_MAX};
1972 InsrHelper(&masm, z3.VnH(), z3_inputs);
1973
1974 // The MacroAssembler implements non-destructive operations using movprfx.
1975 __ Sqdecp(z10.VnD(), p0, z0.VnD());
1976 __ Sqdecp(z11.VnD(), p0, z1.VnD());
1977 __ Sqdecp(z12.VnS(), p0, z2.VnS());
1978 __ Sqdecp(z13.VnH(), p0, z3.VnH());
1979
1980 __ Sqincp(z14.VnD(), p0, z0.VnD());
1981 __ Sqincp(z15.VnD(), p0, z1.VnD());
1982 __ Sqincp(z16.VnS(), p0, z2.VnS());
1983 __ Sqincp(z17.VnH(), p0, z3.VnH());
1984
1985 // Also test destructive forms.
1986 __ Mov(z4, z0);
1987 __ Mov(z5, z1);
1988 __ Mov(z6, z2);
1989 __ Mov(z7, z3);
1990
1991 __ Sqdecp(z0.VnD(), p0);
1992 __ Sqdecp(z1.VnD(), p0);
1993 __ Sqdecp(z2.VnS(), p0);
1994 __ Sqdecp(z3.VnH(), p0);
1995
1996 __ Sqincp(z4.VnD(), p0);
1997 __ Sqincp(z5.VnD(), p0);
1998 __ Sqincp(z6.VnS(), p0);
1999 __ Sqincp(z7.VnH(), p0);
2000
2001 END();
2002 if (CAN_RUN()) {
2003 RUN();
2004
2005 // z0_inputs[...] - number of active D lanes (2)
2006 int64_t z0_expected[] = {0x1234567800000040, -2, -1, INT64_MIN};
2007 ASSERT_EQUAL_SVE(z0_expected, z0.VnD());
2008
2009 // z1_inputs[...] - number of active D lanes (2)
2010 int64_t z1_expected[] = {0x12345678ffffff28, -2, -3, 0x7ffffffffffffffd};
2011 ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
2012
2013 // z2_inputs[...] - number of active S lanes (3)
2014 int32_t z2_expected[] = {0x1234003f, -3, -4, -2, 0x7ffffffc, INT32_MIN};
2015 ASSERT_EQUAL_SVE(z2_expected, z2.VnS());
2016
2017 // z3_inputs[...] - number of active H lanes (5)
2018 int16_t z3_expected[] = {0x1225, -5, -4, -6, INT16_MIN, 0x7ffa};
2019 ASSERT_EQUAL_SVE(z3_expected, z3.VnH());
2020
2021 // z0_inputs[...] + number of active D lanes (2)
2022 uint64_t z4_expected[] = {0x1234567800000044, 2, 3, 0x8000000000000002};
2023 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
2024
2025 // z1_inputs[...] + number of active D lanes (2)
2026 uint64_t z5_expected[] = {0x12345678ffffff2c, 2, 1, INT64_MAX};
2027 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
2028
2029 // z2_inputs[...] + number of active S lanes (3)
2030 uint32_t z6_expected[] = {0x12340045, 3, 2, 4, INT32_MAX, 0x80000003};
2031 ASSERT_EQUAL_SVE(z6_expected, z6.VnS());
2032
2033 // z3_inputs[...] + number of active H lanes (5)
2034 uint16_t z7_expected[] = {0x122f, 5, 6, 4, 0x8005, INT16_MAX};
2035 ASSERT_EQUAL_SVE(z7_expected, z7.VnH());
2036
2037 // Check that the non-destructive macros produced the same results.
2038 ASSERT_EQUAL_SVE(z0_expected, z10.VnD());
2039 ASSERT_EQUAL_SVE(z1_expected, z11.VnD());
2040 ASSERT_EQUAL_SVE(z2_expected, z12.VnS());
2041 ASSERT_EQUAL_SVE(z3_expected, z13.VnH());
2042 ASSERT_EQUAL_SVE(z4_expected, z14.VnD());
2043 ASSERT_EQUAL_SVE(z5_expected, z15.VnD());
2044 ASSERT_EQUAL_SVE(z6_expected, z16.VnS());
2045 ASSERT_EQUAL_SVE(z7_expected, z17.VnH());
2046 }
2047}
2048
2049TEST_SVE(sve_sqinc_sqdec_ptrue_vector) {
2050 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2051 START();
2052
2053 // With an all-true predicate, these instructions increment or decrement by
2054 // the vector length.
2055 __ Ptrue(p15.VnB());
2056
2057 __ Dup(z0.VnD(), 0);
2058 __ Sqdecp(z0.VnD(), p15);
2059
2060 __ Dup(z1.VnS(), 0);
2061 __ Sqdecp(z1.VnS(), p15);
2062
2063 __ Dup(z2.VnH(), 0);
2064 __ Sqdecp(z2.VnH(), p15);
2065
2066 __ Dup(z3.VnD(), 0);
2067 __ Sqincp(z3.VnD(), p15);
2068
2069 __ Dup(z4.VnS(), 0);
2070 __ Sqincp(z4.VnS(), p15);
2071
2072 __ Dup(z5.VnH(), 0);
2073 __ Sqincp(z5.VnH(), p15);
2074
2075 END();
2076 if (CAN_RUN()) {
2077 RUN();
2078
2079 int d_lane_count = core.GetSVELaneCount(kDRegSize);
2080 int s_lane_count = core.GetSVELaneCount(kSRegSize);
2081 int h_lane_count = core.GetSVELaneCount(kHRegSize);
2082
2083 for (int i = 0; i < d_lane_count; i++) {
2084 ASSERT_EQUAL_SVE_LANE(-d_lane_count, z0.VnD(), i);
2085 ASSERT_EQUAL_SVE_LANE(d_lane_count, z3.VnD(), i);
2086 }
2087
2088 for (int i = 0; i < s_lane_count; i++) {
2089 ASSERT_EQUAL_SVE_LANE(-s_lane_count, z1.VnS(), i);
2090 ASSERT_EQUAL_SVE_LANE(s_lane_count, z4.VnS(), i);
2091 }
2092
2093 for (int i = 0; i < h_lane_count; i++) {
2094 ASSERT_EQUAL_SVE_LANE(-h_lane_count, z2.VnH(), i);
2095 ASSERT_EQUAL_SVE_LANE(h_lane_count, z5.VnH(), i);
2096 }
2097 }
2098}
2099
2100TEST_SVE(sve_uqinc_uqdec_p_vector) {
2101 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2102 START();
2103
2104 // There are {5, 3, 2} active {H, S, D} lanes. B-sized lanes are ignored.
2105 int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
2106 Initialise(&masm, p0.VnB(), p0_inputs);
2107
2108 // Check that saturation behaves correctly.
2109
2110 uint64_t z0_inputs[] = {0x1234567800000042, 0, 1, 0x8000000000000000};
2111 InsrHelper(&masm, z0.VnD(), z0_inputs);
2112
2113 uint64_t z1_inputs[] = {0x12345678ffffff2a, 0, UINT64_MAX, INT64_MAX};
2114 InsrHelper(&masm, z1.VnD(), z1_inputs);
2115
2116 uint32_t z2_inputs[] = {0x12340042, 0, UINT32_MAX, 1, INT32_MAX, 0x80000000};
2117 InsrHelper(&masm, z2.VnS(), z2_inputs);
2118
2119 uint16_t z3_inputs[] = {0x122a, 0, 1, UINT16_MAX, 0x8000, INT16_MAX};
2120 InsrHelper(&masm, z3.VnH(), z3_inputs);
2121
2122 // The MacroAssembler implements non-destructive operations using movprfx.
2123 __ Uqdecp(z10.VnD(), p0, z0.VnD());
2124 __ Uqdecp(z11.VnD(), p0, z1.VnD());
2125 __ Uqdecp(z12.VnS(), p0, z2.VnS());
2126 __ Uqdecp(z13.VnH(), p0, z3.VnH());
2127
2128 __ Uqincp(z14.VnD(), p0, z0.VnD());
2129 __ Uqincp(z15.VnD(), p0, z1.VnD());
2130 __ Uqincp(z16.VnS(), p0, z2.VnS());
2131 __ Uqincp(z17.VnH(), p0, z3.VnH());
2132
2133 // Also test destructive forms.
2134 __ Mov(z4, z0);
2135 __ Mov(z5, z1);
2136 __ Mov(z6, z2);
2137 __ Mov(z7, z3);
2138
2139 __ Uqdecp(z0.VnD(), p0);
2140 __ Uqdecp(z1.VnD(), p0);
2141 __ Uqdecp(z2.VnS(), p0);
2142 __ Uqdecp(z3.VnH(), p0);
2143
2144 __ Uqincp(z4.VnD(), p0);
2145 __ Uqincp(z5.VnD(), p0);
2146 __ Uqincp(z6.VnS(), p0);
2147 __ Uqincp(z7.VnH(), p0);
2148
2149 END();
2150 if (CAN_RUN()) {
2151 RUN();
2152
2153 // z0_inputs[...] - number of active D lanes (2)
2154 uint64_t z0_expected[] = {0x1234567800000040, 0, 0, 0x7ffffffffffffffe};
2155 ASSERT_EQUAL_SVE(z0_expected, z0.VnD());
2156
2157 // z1_inputs[...] - number of active D lanes (2)
2158 uint64_t z1_expected[] = {0x12345678ffffff28,
2159 0,
2160 0xfffffffffffffffd,
2161 0x7ffffffffffffffd};
2162 ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
2163
2164 // z2_inputs[...] - number of active S lanes (3)
2165 uint32_t z2_expected[] =
2166 {0x1234003f, 0, 0xfffffffc, 0, 0x7ffffffc, 0x7ffffffd};
2167 ASSERT_EQUAL_SVE(z2_expected, z2.VnS());
2168
2169 // z3_inputs[...] - number of active H lanes (5)
2170 uint16_t z3_expected[] = {0x1225, 0, 0, 0xfffa, 0x7ffb, 0x7ffa};
2171 ASSERT_EQUAL_SVE(z3_expected, z3.VnH());
2172
2173 // z0_inputs[...] + number of active D lanes (2)
2174 uint64_t z4_expected[] = {0x1234567800000044, 2, 3, 0x8000000000000002};
2175 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
2176
2177 // z1_inputs[...] + number of active D lanes (2)
2178 uint64_t z5_expected[] = {0x12345678ffffff2c,
2179 2,
2180 UINT64_MAX,
2181 0x8000000000000001};
2182 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
2183
2184 // z2_inputs[...] + number of active S lanes (3)
2185 uint32_t z6_expected[] =
2186 {0x12340045, 3, UINT32_MAX, 4, 0x80000002, 0x80000003};
2187 ASSERT_EQUAL_SVE(z6_expected, z6.VnS());
2188
2189 // z3_inputs[...] + number of active H lanes (5)
2190 uint16_t z7_expected[] = {0x122f, 5, 6, UINT16_MAX, 0x8005, 0x8004};
2191 ASSERT_EQUAL_SVE(z7_expected, z7.VnH());
2192
2193 // Check that the non-destructive macros produced the same results.
2194 ASSERT_EQUAL_SVE(z0_expected, z10.VnD());
2195 ASSERT_EQUAL_SVE(z1_expected, z11.VnD());
2196 ASSERT_EQUAL_SVE(z2_expected, z12.VnS());
2197 ASSERT_EQUAL_SVE(z3_expected, z13.VnH());
2198 ASSERT_EQUAL_SVE(z4_expected, z14.VnD());
2199 ASSERT_EQUAL_SVE(z5_expected, z15.VnD());
2200 ASSERT_EQUAL_SVE(z6_expected, z16.VnS());
2201 ASSERT_EQUAL_SVE(z7_expected, z17.VnH());
2202 }
2203}
2204
2205TEST_SVE(sve_uqinc_uqdec_ptrue_vector) {
2206 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2207 START();
2208
2209 // With an all-true predicate, these instructions increment or decrement by
2210 // the vector length.
2211 __ Ptrue(p15.VnB());
2212
2213 __ Mov(x0, 0x1234567800000000);
2214 __ Mov(x1, 0x12340000);
2215 __ Mov(x2, 0x1200);
2216
2217 __ Dup(z0.VnD(), x0);
2218 __ Uqdecp(z0.VnD(), p15);
2219
2220 __ Dup(z1.VnS(), x1);
2221 __ Uqdecp(z1.VnS(), p15);
2222
2223 __ Dup(z2.VnH(), x2);
2224 __ Uqdecp(z2.VnH(), p15);
2225
2226 __ Dup(z3.VnD(), x0);
2227 __ Uqincp(z3.VnD(), p15);
2228
2229 __ Dup(z4.VnS(), x1);
2230 __ Uqincp(z4.VnS(), p15);
2231
2232 __ Dup(z5.VnH(), x2);
2233 __ Uqincp(z5.VnH(), p15);
2234
2235 END();
2236 if (CAN_RUN()) {
2237 RUN();
2238
2239 int d_lane_count = core.GetSVELaneCount(kDRegSize);
2240 int s_lane_count = core.GetSVELaneCount(kSRegSize);
2241 int h_lane_count = core.GetSVELaneCount(kHRegSize);
2242
2243 for (int i = 0; i < d_lane_count; i++) {
2244 ASSERT_EQUAL_SVE_LANE(0x1234567800000000 - d_lane_count, z0.VnD(), i);
2245 ASSERT_EQUAL_SVE_LANE(0x1234567800000000 + d_lane_count, z3.VnD(), i);
2246 }
2247
2248 for (int i = 0; i < s_lane_count; i++) {
2249 ASSERT_EQUAL_SVE_LANE(0x12340000 - s_lane_count, z1.VnS(), i);
2250 ASSERT_EQUAL_SVE_LANE(0x12340000 + s_lane_count, z4.VnS(), i);
2251 }
2252
2253 for (int i = 0; i < h_lane_count; i++) {
2254 ASSERT_EQUAL_SVE_LANE(0x1200 - h_lane_count, z2.VnH(), i);
2255 ASSERT_EQUAL_SVE_LANE(0x1200 + h_lane_count, z5.VnH(), i);
2256 }
2257 }
2258}
2259
2260TEST_SVE(sve_index) {
2261 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2262 START();
2263
2264 // Simple cases.
2265 __ Index(z0.VnB(), 0, 1);
2266 __ Index(z1.VnH(), 1, 1);
2267 __ Index(z2.VnS(), 2, 1);
2268 __ Index(z3.VnD(), 3, 1);
2269
2270 // Synthesised immediates.
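  // (The immediate form of `index` is understood to encode only small signed
  // values, a 5-bit range of roughly -16 to 15, so constants such as 42 force
  // the macro to synthesise the operands.)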
2271 __ Index(z4.VnB(), 42, -1);
2272 __ Index(z5.VnH(), -1, 42);
2273 __ Index(z6.VnS(), 42, 42);
2274
2275 // Register arguments.
2276 __ Mov(x0, 42);
2277 __ Mov(x1, -3);
2278 __ Index(z10.VnD(), x0, x1);
2279 __ Index(z11.VnB(), w0, w1);
2280 // The register size should correspond to the lane size, but VIXL allows any
2281 // register at least as big as the lane size.
2282 __ Index(z12.VnB(), x0, x1);
2283 __ Index(z13.VnH(), w0, x1);
2284 __ Index(z14.VnS(), x0, w1);
2285
2286 // Integer overflow.
2287 __ Index(z20.VnB(), UINT8_MAX - 2, 2);
2288 __ Index(z21.VnH(), 7, -3);
2289 __ Index(z22.VnS(), INT32_MAX - 2, 1);
2290 __ Index(z23.VnD(), INT64_MIN + 6, -7);
2291
2292 END();
2293
2294 if (CAN_RUN()) {
2295 RUN();
2296
2297 int b_lane_count = core.GetSVELaneCount(kBRegSize);
2298 int h_lane_count = core.GetSVELaneCount(kHRegSize);
2299 int s_lane_count = core.GetSVELaneCount(kSRegSize);
2300 int d_lane_count = core.GetSVELaneCount(kDRegSize);
2301
2302 uint64_t b_mask = GetUintMask(kBRegSize);
2303 uint64_t h_mask = GetUintMask(kHRegSize);
2304 uint64_t s_mask = GetUintMask(kSRegSize);
2305 uint64_t d_mask = GetUintMask(kDRegSize);
2306
2307 // Simple cases.
2308 for (int i = 0; i < b_lane_count; i++) {
2309 ASSERT_EQUAL_SVE_LANE((0 + i) & b_mask, z0.VnB(), i);
2310 }
2311 for (int i = 0; i < h_lane_count; i++) {
2312 ASSERT_EQUAL_SVE_LANE((1 + i) & h_mask, z1.VnH(), i);
2313 }
2314 for (int i = 0; i < s_lane_count; i++) {
2315 ASSERT_EQUAL_SVE_LANE((2 + i) & s_mask, z2.VnS(), i);
2316 }
2317 for (int i = 0; i < d_lane_count; i++) {
2318 ASSERT_EQUAL_SVE_LANE((3 + i) & d_mask, z3.VnD(), i);
2319 }
2320
2321 // Synthesised immediates.
2322 for (int i = 0; i < b_lane_count; i++) {
2323 ASSERT_EQUAL_SVE_LANE((42 - i) & b_mask, z4.VnB(), i);
2324 }
2325 for (int i = 0; i < h_lane_count; i++) {
2326 ASSERT_EQUAL_SVE_LANE((-1 + (42 * i)) & h_mask, z5.VnH(), i);
2327 }
2328 for (int i = 0; i < s_lane_count; i++) {
2329 ASSERT_EQUAL_SVE_LANE((42 + (42 * i)) & s_mask, z6.VnS(), i);
2330 }
2331
2332 // Register arguments.
2333 for (int i = 0; i < d_lane_count; i++) {
2334 ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & d_mask, z10.VnD(), i);
2335 }
2336 for (int i = 0; i < b_lane_count; i++) {
2337 ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & b_mask, z11.VnB(), i);
2338 }
2339 for (int i = 0; i < b_lane_count; i++) {
2340 ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & b_mask, z12.VnB(), i);
2341 }
2342 for (int i = 0; i < h_lane_count; i++) {
2343 ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & h_mask, z13.VnH(), i);
2344 }
2345 for (int i = 0; i < s_lane_count; i++) {
2346 ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & s_mask, z14.VnS(), i);
2347 }
2348
2349 // Integer overflow.
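    // (For example, z20 starts at 0xfd and steps by 2, wrapping modulo 256:
    // 0xfd, 0xff, 0x01, 0x03, 0x05, ... - the expected arrays below list the
    // highest lane first and lane 0 last.)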
2350 uint8_t expected_z20[] = {0x05, 0x03, 0x01, 0xff, 0xfd};
2351 ASSERT_EQUAL_SVE(expected_z20, z20.VnB());
2352 uint16_t expected_z21[] = {0xfffb, 0xfffe, 0x0001, 0x0004, 0x0007};
2353 ASSERT_EQUAL_SVE(expected_z21, z21.VnH());
2354 uint32_t expected_z22[] = {0x80000000, 0x7fffffff, 0x7ffffffe, 0x7ffffffd};
2355 ASSERT_EQUAL_SVE(expected_z22, z22.VnS());
2356 uint64_t expected_z23[] = {0x7fffffffffffffff, 0x8000000000000006};
2357 ASSERT_EQUAL_SVE(expected_z23, z23.VnD());
2358 }
2359}
2360
2361TEST(sve_int_compare_count_and_limit_scalars) {
2362 SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2363 START();
2364
2365 __ Mov(w20, 0xfffffffd);
2366 __ Mov(w21, 0xffffffff);
2367
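  // With signed inputs -3 and -1, the incrementing value satisfies the
  // "less than or equal" condition for -3, -2 and -1, so each `whilele` below
  // produces a predicate with its three lowest lanes active.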
2368 __ Whilele(p0.VnB(), w20, w21);
2369 __ Mrs(x0, NZCV);
2370 __ Whilele(p1.VnH(), w20, w21);
2371 __ Mrs(x1, NZCV);
2372
2373 __ Mov(w20, 0xffffffff);
2374 __ Mov(w21, 0x00000000);
2375
2376 __ Whilelt(p2.VnS(), w20, w21);
2377 __ Mrs(x2, NZCV);
2378 __ Whilelt(p3.VnD(), w20, w21);
2379 __ Mrs(x3, NZCV);
2380
2381 __ Mov(w20, 0xfffffffd);
2382 __ Mov(w21, 0xffffffff);
2383
2384 __ Whilels(p4.VnB(), w20, w21);
2385 __ Mrs(x4, NZCV);
2386 __ Whilels(p5.VnH(), w20, w21);
2387 __ Mrs(x5, NZCV);
2388
2389 __ Mov(w20, 0xffffffff);
2390 __ Mov(w21, 0x00000000);
2391
2392 __ Whilelo(p6.VnS(), w20, w21);
2393 __ Mrs(x6, NZCV);
2394 __ Whilelo(p7.VnD(), w20, w21);
2395 __ Mrs(x7, NZCV);
2396
2397 __ Mov(x20, 0xfffffffffffffffd);
2398 __ Mov(x21, 0xffffffffffffffff);
2399
2400 __ Whilele(p8.VnB(), x20, x21);
2401 __ Mrs(x8, NZCV);
2402 __ Whilele(p9.VnH(), x20, x21);
2403 __ Mrs(x9, NZCV);
2404
2405 __ Mov(x20, 0xffffffffffffffff);
2406 __ Mov(x21, 0x0000000000000000);
2407
2408 __ Whilelt(p10.VnS(), x20, x21);
2409 __ Mrs(x10, NZCV);
2410 __ Whilelt(p11.VnD(), x20, x21);
2411 __ Mrs(x11, NZCV);
2412
2413 __ Mov(x20, 0xfffffffffffffffd);
2414 __ Mov(x21, 0xffffffffffffffff);
2415
2416 __ Whilels(p12.VnB(), x20, x21);
2417 __ Mrs(x12, NZCV);
2418 __ Whilels(p13.VnH(), x20, x21);
2419 __ Mrs(x13, NZCV);
2420
2421 __ Mov(x20, 0xffffffffffffffff);
2422 __ Mov(x21, 0x0000000000000000);
2423
2424 __ Whilelo(p14.VnS(), x20, x21);
2425 __ Mrs(x14, NZCV);
2426 __ Whilelo(p15.VnD(), x20, x21);
2427 __ Mrs(x15, NZCV);
2428
2429 END();
2430
2431 if (CAN_RUN()) {
2432 RUN();
2433
2434 // 0b...00000000'00000111
2435 int p0_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
2436 ASSERT_EQUAL_SVE(p0_expected, p0.VnB());
2437
2438 // 0b...00000000'00010101
2439 int p1_expected[] = {0, 0, 0, 0, 0, 1, 1, 1};
2440 ASSERT_EQUAL_SVE(p1_expected, p1.VnH());
2441
2442 int p2_expected[] = {0x0, 0x0, 0x0, 0x1};
2443 ASSERT_EQUAL_SVE(p2_expected, p2.VnS());
2444
2445 int p3_expected[] = {0x00, 0x01};
2446 ASSERT_EQUAL_SVE(p3_expected, p3.VnD());
2447
2448 // 0b...11111111'11111111
2449 int p4_expected[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
2450 ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
2451
2452 // 0b...01010101'01010101
2453 int p5_expected[] = {1, 1, 1, 1, 1, 1, 1, 1};
2454 ASSERT_EQUAL_SVE(p5_expected, p5.VnH());
2455
2456 int p6_expected[] = {0x0, 0x0, 0x0, 0x0};
2457 ASSERT_EQUAL_SVE(p6_expected, p6.VnS());
2458
2459 int p7_expected[] = {0x00, 0x00};
2460 ASSERT_EQUAL_SVE(p7_expected, p7.VnD());
2461
2462 // 0b...00000000'00000111
2463 int p8_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
2464 ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
2465
2466 // 0b...00000000'00010101
2467 int p9_expected[] = {0, 0, 0, 0, 0, 1, 1, 1};
2468 ASSERT_EQUAL_SVE(p9_expected, p9.VnH());
2469
2470 int p10_expected[] = {0x0, 0x0, 0x0, 0x1};
2471 ASSERT_EQUAL_SVE(p10_expected, p10.VnS());
2472
2473 int p11_expected[] = {0x00, 0x01};
2474 ASSERT_EQUAL_SVE(p11_expected, p11.VnD());
2475
2476 // 0b...11111111'11111111
2477 int p12_expected[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
2478 ASSERT_EQUAL_SVE(p12_expected, p12.VnB());
2479
2480 // 0b...01010101'01010101
2481 int p13_expected[] = {1, 1, 1, 1, 1, 1, 1, 1};
2482 ASSERT_EQUAL_SVE(p13_expected, p13.VnH());
2483
2484 int p14_expected[] = {0x0, 0x0, 0x0, 0x0};
2485 ASSERT_EQUAL_SVE(p14_expected, p14.VnS());
2486
2487 int p15_expected[] = {0x00, 0x00};
2488 ASSERT_EQUAL_SVE(p15_expected, p15.VnD());
2489
2490 ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w0);
2491 ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w1);
2492 ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w2);
2493 ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w3);
2494 ASSERT_EQUAL_32(SVEFirstFlag, w4);
2495 ASSERT_EQUAL_32(SVEFirstFlag, w5);
2496 ASSERT_EQUAL_32(SVENoneFlag | SVENotLastFlag, w6);
2497 ASSERT_EQUAL_32(SVENoneFlag | SVENotLastFlag, w7);
2498 ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w8);
2499 ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w9);
2500 ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w10);
2501 ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w11);
2502 ASSERT_EQUAL_32(SVEFirstFlag, w12);
2503 ASSERT_EQUAL_32(SVEFirstFlag, w13);
2504 ASSERT_EQUAL_32(SVENoneFlag | SVENotLastFlag, w14);
2505 ASSERT_EQUAL_32(SVENoneFlag | SVENotLastFlag, w15);
2506 }
2507}
2508
2509TEST(sve_int_compare_vectors_signed_imm) {
2510 SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2511 START();
2512
2513 int z13_inputs[] = {0, 1, -1, -15, 126, -127, -126, -15};
2514 int mask_inputs1[] = {1, 1, 1, 0, 1, 1, 1, 1};
2515 InsrHelper(&masm, z13.VnB(), z13_inputs);
2516 Initialise(&masm, p0.VnB(), mask_inputs1);
2517
2518 __ Cmpeq(p2.VnB(), p0.Zeroing(), z13.VnB(), -15);
2519 __ Mrs(x2, NZCV);
2520 __ Cmpeq(p3.VnB(), p0.Zeroing(), z13.VnB(), -127);
2521
2522 int z14_inputs[] = {0, 1, -1, -32767, -32766, 32767, 32766, 0};
2523 int mask_inputs2[] = {1, 1, 1, 0, 1, 1, 1, 1};
2524 InsrHelper(&masm, z14.VnH(), z14_inputs);
2525 Initialise(&masm, p0.VnH(), mask_inputs2);
2526
2527 __ Cmpge(p4.VnH(), p0.Zeroing(), z14.VnH(), -1);
2528 __ Mrs(x4, NZCV);
2529 __ Cmpge(p5.VnH(), p0.Zeroing(), z14.VnH(), -32767);
2530
2531 int z15_inputs[] = {0, 1, -1, INT_MIN};
2532 int mask_inputs3[] = {0, 1, 1, 1};
2533 InsrHelper(&masm, z15.VnS(), z15_inputs);
2534 Initialise(&masm, p0.VnS(), mask_inputs3);
2535
2536 __ Cmpgt(p6.VnS(), p0.Zeroing(), z15.VnS(), 0);
2537 __ Mrs(x6, NZCV);
2538 __ Cmpgt(p7.VnS(), p0.Zeroing(), z15.VnS(), INT_MIN + 1);
2539
2540 __ Cmplt(p8.VnS(), p0.Zeroing(), z15.VnS(), 0);
2541 __ Mrs(x8, NZCV);
2542 __ Cmplt(p9.VnS(), p0.Zeroing(), z15.VnS(), INT_MIN + 1);
2543
2544 int64_t z16_inputs[] = {0, -1};
2545 int mask_inputs4[] = {1, 1};
2546 InsrHelper(&masm, z16.VnD(), z16_inputs);
2547 Initialise(&masm, p0.VnD(), mask_inputs4);
2548
2549 __ Cmple(p10.VnD(), p0.Zeroing(), z16.VnD(), -1);
2550 __ Mrs(x10, NZCV);
2551 __ Cmple(p11.VnD(), p0.Zeroing(), z16.VnD(), LLONG_MIN);
2552
2553 __ Cmpne(p12.VnD(), p0.Zeroing(), z16.VnD(), -1);
2554 __ Mrs(x12, NZCV);
2555 __ Cmpne(p13.VnD(), p0.Zeroing(), z16.VnD(), LLONG_MAX);
2556
2557 END();
2558
2559 if (CAN_RUN()) {
2560 RUN();
2561
2562 int p2_expected[] = {0, 0, 0, 0, 0, 0, 0, 1};
2563 ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
2564
2565 int p3_expected[] = {0, 0, 0, 0, 0, 1, 0, 0};
2566 ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
2567
2568 int p4_expected[] = {0x1, 0x1, 0x1, 0x0, 0x0, 0x1, 0x1, 0x1};
2569 ASSERT_EQUAL_SVE(p4_expected, p4.VnH());
2570
2571 int p5_expected[] = {0x1, 0x1, 0x1, 0x0, 0x1, 0x1, 0x1, 0x1};
2572 ASSERT_EQUAL_SVE(p5_expected, p5.VnH());
2573
2574 int p6_expected[] = {0x0, 0x1, 0x0, 0x0};
2575 ASSERT_EQUAL_SVE(p6_expected, p6.VnS());
2576
2577 int p7_expected[] = {0x0, 0x1, 0x1, 0x0};
2578 ASSERT_EQUAL_SVE(p7_expected, p7.VnS());
2579
2580 int p8_expected[] = {0x0, 0x0, 0x1, 0x1};
2581 ASSERT_EQUAL_SVE(p8_expected, p8.VnS());
2582
2583 int p9_expected[] = {0x0, 0x0, 0x0, 0x1};
2584 ASSERT_EQUAL_SVE(p9_expected, p9.VnS());
2585
2586 int p10_expected[] = {0x00, 0x01};
2587 ASSERT_EQUAL_SVE(p10_expected, p10.VnD());
2588
2589 int p11_expected[] = {0x00, 0x00};
2590 ASSERT_EQUAL_SVE(p11_expected, p11.VnD());
2591
2592 int p12_expected[] = {0x01, 0x00};
2593 ASSERT_EQUAL_SVE(p12_expected, p12.VnD());
2594
2595 int p13_expected[] = {0x01, 0x01};
2596 ASSERT_EQUAL_SVE(p13_expected, p13.VnD());
2597
2598 ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w2);
2599 ASSERT_EQUAL_32(SVEFirstFlag, w4);
2600 ASSERT_EQUAL_32(NoFlag, w6);
2601 ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w8);
2602 ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w10);
2603 ASSERT_EQUAL_32(NoFlag, w12);
2604 }
2605}
2606
2607TEST(sve_int_compare_vectors_unsigned_imm) {
2608 SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2609 START();
2610
2611 uint32_t src1_inputs[] = {0xf7, 0x0f, 0x8f, 0x1f, 0x83, 0x12, 0x00, 0xf1};
2612 int mask_inputs1[] = {1, 1, 1, 0, 1, 1, 0, 1};
2613 InsrHelper(&masm, z13.VnB(), src1_inputs);
2614 Initialise(&masm, p0.VnB(), mask_inputs1);
2615
2616 __ Cmphi(p2.VnB(), p0.Zeroing(), z13.VnB(), 0x0f);
2617 __ Mrs(x2, NZCV);
2618 __ Cmphi(p3.VnB(), p0.Zeroing(), z13.VnB(), 0xf0);
2619
2620 uint32_t src2_inputs[] = {0xffff, 0x8000, 0x1fff, 0x0000, 0x1234};
2621 int mask_inputs2[] = {1, 1, 1, 1, 0};
2622 InsrHelper(&masm, z13.VnH(), src2_inputs);
2623 Initialise(&masm, p0.VnH(), mask_inputs2);
2624
2625 __ Cmphs(p4.VnH(), p0.Zeroing(), z13.VnH(), 0x1f);
2626 __ Mrs(x4, NZCV);
2627 __ Cmphs(p5.VnH(), p0.Zeroing(), z13.VnH(), 0x1fff);
2628
2629 uint32_t src3_inputs[] = {0xffffffff, 0xfedcba98, 0x0000ffff, 0x00000000};
2630 int mask_inputs3[] = {1, 1, 1, 1};
2631 InsrHelper(&masm, z13.VnS(), src3_inputs);
2632 Initialise(&masm, p0.VnS(), mask_inputs3);
2633
2634 __ Cmplo(p6.VnS(), p0.Zeroing(), z13.VnS(), 0x3f);
2635 __ Mrs(x6, NZCV);
2636 __ Cmplo(p7.VnS(), p0.Zeroing(), z13.VnS(), 0x3f3f3f3f);
2637
2638 uint64_t src4_inputs[] = {0xffffffffffffffff, 0x0000000000000000};
2639 int mask_inputs4[] = {1, 1};
2640 InsrHelper(&masm, z13.VnD(), src4_inputs);
2641 Initialise(&masm, p0.VnD(), mask_inputs4);
2642
2643 __ Cmpls(p8.VnD(), p0.Zeroing(), z13.VnD(), 0x2f);
2644 __ Mrs(x8, NZCV);
2645 __ Cmpls(p9.VnD(), p0.Zeroing(), z13.VnD(), 0x800000000000000);
2646
2647 END();
2648
2649 if (CAN_RUN()) {
2650 RUN();
2651
2652 int p2_expected[] = {1, 0, 1, 0, 1, 1, 0, 1};
2653 ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
2654
2655 int p3_expected[] = {1, 0, 0, 0, 0, 0, 0, 1};
2656 ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
2657
2658 int p4_expected[] = {0x1, 0x1, 0x1, 0x0, 0x0};
2659 ASSERT_EQUAL_SVE(p4_expected, p4.VnH());
2660
2661 int p5_expected[] = {0x1, 0x1, 0x1, 0x0, 0x0};
2662 ASSERT_EQUAL_SVE(p5_expected, p5.VnH());
2663
2664 int p6_expected[] = {0x0, 0x0, 0x0, 0x1};
2665 ASSERT_EQUAL_SVE(p6_expected, p6.VnS());
2666
2667 int p7_expected[] = {0x0, 0x0, 0x1, 0x1};
2668 ASSERT_EQUAL_SVE(p7_expected, p7.VnS());
2669
2670 int p8_expected[] = {0x00, 0x01};
2671 ASSERT_EQUAL_SVE(p8_expected, p8.VnD());
2672
2673 int p9_expected[] = {0x00, 0x01};
2674 ASSERT_EQUAL_SVE(p9_expected, p9.VnD());
2675
2676 ASSERT_EQUAL_32(SVEFirstFlag, w2);
2677 ASSERT_EQUAL_32(NoFlag, w4);
2678 ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w6);
2679 ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w8);
2680 }
2681}
2682
2683TEST(sve_int_compare_conditionally_terminate_scalars) {
2684 SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2685 START();
2686
2687 __ Mov(x0, 0xfedcba9887654321);
2688 __ Mov(x1, 0x1000100010001000);
2689
2690 // Initialise Z and C. These are preserved by cterm*, and the V flag is set to
2691 // !C if the condition does not hold.
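  // (Concretely: `ctermeq` is expected to set N and clear V when its operands
  // compare equal, and otherwise to clear N and set V to !C; `ctermne` behaves
  // the same way with the comparison inverted.)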
2692 __ Mov(x10, NoFlag);
2693 __ Msr(NZCV, x10);
2694
2695 __ Ctermeq(w0, w0);
2696 __ Mrs(x2, NZCV);
2697 __ Ctermeq(x0, x1);
2698 __ Mrs(x3, NZCV);
2699 __ Ctermne(x0, x0);
2700 __ Mrs(x4, NZCV);
2701 __ Ctermne(w0, w1);
2702 __ Mrs(x5, NZCV);
2703
2704 // As above, but with all flags initially set.
2705 __ Mov(x10, NZCVFlag);
2706 __ Msr(NZCV, x10);
2707
2708 __ Ctermeq(w0, w0);
2709 __ Mrs(x6, NZCV);
2710 __ Ctermeq(x0, x1);
2711 __ Mrs(x7, NZCV);
2712 __ Ctermne(x0, x0);
2713 __ Mrs(x8, NZCV);
2714 __ Ctermne(w0, w1);
2715 __ Mrs(x9, NZCV);
2716
2717 END();
2718
2719 if (CAN_RUN()) {
2720 RUN();
2721
2722 ASSERT_EQUAL_32(SVEFirstFlag, w2);
2723 ASSERT_EQUAL_32(VFlag, w3);
2724 ASSERT_EQUAL_32(VFlag, w4);
2725 ASSERT_EQUAL_32(SVEFirstFlag, w5);
2726
2727 ASSERT_EQUAL_32(SVEFirstFlag | ZCFlag, w6);
2728 ASSERT_EQUAL_32(ZCFlag, w7);
2729 ASSERT_EQUAL_32(ZCFlag, w8);
2730 ASSERT_EQUAL_32(SVEFirstFlag | ZCFlag, w9);
2731 }
2732}
2733
2734// Work out what the architectural `PredTest` pseudocode should produce for the
2735// given result and governing predicate.
2736template <typename Tg, typename Td, int N>
2737static StatusFlags GetPredTestFlags(const Td (&pd)[N],
2738 const Tg (&pg)[N],
2739 int vl) {
2740 int first = -1;
2741 int last = -1;
2742 bool any_active = false;
2743
2744 // Only consider potentially-active lanes.
2745 int start = (N > vl) ? (N - vl) : 0;
2746 for (int i = start; i < N; i++) {
2747 if ((pg[i] & 1) == 1) {
2748 // Look for the first and last active lanes.
2749 // Note that the 'first' lane is the one with the highest index.
2750 if (last < 0) last = i;
2751 first = i;
2752 // Look for any active lanes that are also active in pd.
2753 if ((pd[i] & 1) == 1) any_active = true;
2754 }
2755 }
2756
2757 uint32_t flags = 0;
2758 if ((first >= 0) && ((pd[first] & 1) == 1)) flags |= SVEFirstFlag;
2759 if (!any_active) flags |= SVENoneFlag;
2760 if ((last < 0) || ((pd[last] & 1) == 0)) flags |= SVENotLastFlag;
2761 return static_cast<StatusFlags>(flags);
2762}
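// (For reference: SVEFirstFlag, SVENoneFlag and SVENotLastFlag are understood
// to map onto the N, Z and C bits of NZCV, which is why the helpers below read
// the result of flag-setting predicate operations back with `Mrs(x, NZCV)`.)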
2763
2764typedef void (MacroAssembler::*PfirstPnextFn)(const PRegisterWithLaneSize& pd,
2765 const PRegister& pg,
2766 const PRegisterWithLaneSize& pn);
2767template <typename Tg, typename Tn, typename Td>
2768static void PfirstPnextHelper(Test* config,
2769 PfirstPnextFn macro,
2770 unsigned lane_size_in_bits,
2771 const Tg& pg_inputs,
2772 const Tn& pn_inputs,
2773 const Td& pd_expected) {
2774 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2775 START();
2776
2777 PRegister pg = p15;
2778 PRegister pn = p14;
2779 Initialise(&masm, pg.WithLaneSize(lane_size_in_bits), pg_inputs);
2780 Initialise(&masm, pn.WithLaneSize(lane_size_in_bits), pn_inputs);
2781
2782 // Initialise NZCV to an impossible value, to check that we actually write it.
2783 __ Mov(x10, NZCVFlag);
2784
2785 // If pd.Is(pn), the MacroAssembler simply passes the arguments directly to
2786 // the Assembler.
2787 __ Msr(NZCV, x10);
2788 __ Mov(p0, pn);
2789 (masm.*macro)(p0.WithLaneSize(lane_size_in_bits),
2790 pg,
2791 p0.WithLaneSize(lane_size_in_bits));
2792 __ Mrs(x0, NZCV);
2793
2794 // The MacroAssembler supports non-destructive use.
2795 __ Msr(NZCV, x10);
2796 (masm.*macro)(p1.WithLaneSize(lane_size_in_bits),
2797 pg,
2798 pn.WithLaneSize(lane_size_in_bits));
2799 __ Mrs(x1, NZCV);
2800
2801 // If pd.Aliases(pg) the macro requires a scratch register.
2802 {
2803 UseScratchRegisterScope temps(&masm);
2804 temps.Include(p13);
2805 __ Msr(NZCV, x10);
2806 __ Mov(p2, p15);
2807 (masm.*macro)(p2.WithLaneSize(lane_size_in_bits),
2808 p2,
2809 pn.WithLaneSize(lane_size_in_bits));
2810 __ Mrs(x2, NZCV);
2811 }
2812
2813 END();
2814
2815 if (CAN_RUN()) {
2816 RUN();
2817
2818 // Check that the inputs weren't modified.
2819 ASSERT_EQUAL_SVE(pn_inputs, pn.WithLaneSize(lane_size_in_bits));
2820 ASSERT_EQUAL_SVE(pg_inputs, pg.WithLaneSize(lane_size_in_bits));
2821
2822 // Check the primary operation.
2823 ASSERT_EQUAL_SVE(pd_expected, p0.WithLaneSize(lane_size_in_bits));
2824 ASSERT_EQUAL_SVE(pd_expected, p1.WithLaneSize(lane_size_in_bits));
2825 ASSERT_EQUAL_SVE(pd_expected, p2.WithLaneSize(lane_size_in_bits));
2826
2827 // Check that the flags were properly set.
2828 StatusFlags nzcv_expected =
2829 GetPredTestFlags(pd_expected,
2830 pg_inputs,
2831 core.GetSVELaneCount(kBRegSize));
2832 ASSERT_EQUAL_64(nzcv_expected, x0);
2833 ASSERT_EQUAL_64(nzcv_expected, x1);
2834 ASSERT_EQUAL_64(nzcv_expected, x2);
2835 }
2836}
2837
2838template <typename Tg, typename Tn, typename Td>
2839static void PfirstHelper(Test* config,
2840 const Tg& pg_inputs,
2841 const Tn& pn_inputs,
2842 const Td& pd_expected) {
2843 PfirstPnextHelper(config,
2844 &MacroAssembler::Pfirst,
2845 kBRegSize, // pfirst only accepts B-sized lanes.
2846 pg_inputs,
2847 pn_inputs,
2848 pd_expected);
2849}
2850
2851template <typename Tg, typename Tn, typename Td>
2852static void PnextHelper(Test* config,
2853 unsigned lane_size_in_bits,
2854 const Tg& pg_inputs,
2855 const Tn& pn_inputs,
2856 const Td& pd_expected) {
2857 PfirstPnextHelper(config,
2858 &MacroAssembler::Pnext,
2859 lane_size_in_bits,
2860 pg_inputs,
2861 pn_inputs,
2862 pd_expected);
2863}
2864
2865TEST_SVE(sve_pfirst) {
2866 // Provide more lanes than kPRegMinSize (to check propagation if we have a
2867 // large VL), but few enough to make the test easy to read.
2868 int in0[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
2869 int in1[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
2870 int in2[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
2871 int in3[] = {0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1};
2872 int in4[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
2873 VIXL_ASSERT(ArrayLength(in0) > kPRegMinSize);
2874
2875 // Pfirst finds the first active lane in pg, and activates the corresponding
2876 // lane in pn (if it isn't already active).
2877
2878 // The first active lane in in1 is here. |
2879 // v
2880 int exp10[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
2881 int exp12[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0};
2882 int exp13[] = {0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1};
2883 int exp14[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
2884 PfirstHelper(config, in1, in0, exp10);
2885 PfirstHelper(config, in1, in2, exp12);
2886 PfirstHelper(config, in1, in3, exp13);
2887 PfirstHelper(config, in1, in4, exp14);
2888
2889 // The first active lane in in2 is here. |
2890 // v
2891 int exp20[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0};
2892 int exp21[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0};
2893 int exp23[] = {0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1};
2894 int exp24[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0};
2895 PfirstHelper(config, in2, in0, exp20);
2896 PfirstHelper(config, in2, in1, exp21);
2897 PfirstHelper(config, in2, in3, exp23);
2898 PfirstHelper(config, in2, in4, exp24);
2899
2900 // The first active lane in in3 is here. |
2901 // v
2902 int exp30[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
2903 int exp31[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1};
2904 int exp32[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1};
2905 int exp34[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
2906 PfirstHelper(config, in3, in0, exp30);
2907 PfirstHelper(config, in3, in1, exp31);
2908 PfirstHelper(config, in3, in2, exp32);
2909 PfirstHelper(config, in3, in4, exp34);
2910
2911 // | The first active lane in in4 is here.
2912 // v
2913 int exp40[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
2914 int exp41[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
2915 int exp42[] = {1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
2916 int exp43[] = {1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1};
2917 PfirstHelper(config, in4, in0, exp40);
2918 PfirstHelper(config, in4, in1, exp41);
2919 PfirstHelper(config, in4, in2, exp42);
2920 PfirstHelper(config, in4, in3, exp43);
2921
2922 // If pg is all inactive, the input is passed through unchanged.
2923 PfirstHelper(config, in0, in0, in0);
2924 PfirstHelper(config, in0, in1, in1);
2925 PfirstHelper(config, in0, in2, in2);
2926 PfirstHelper(config, in0, in3, in3);
2927
2928 // If the values of pg and pn match, the value is passed through unchanged.
2929 PfirstHelper(config, in0, in0, in0);
2930 PfirstHelper(config, in1, in1, in1);
2931 PfirstHelper(config, in2, in2, in2);
2932 PfirstHelper(config, in3, in3, in3);
2933}
2934
2935TEST_SVE(sve_pfirst_alias) {
2936 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2937 START();
2938
2939 // Check that the Simulator behaves correctly when all arguments are aliased.
2940 int in_b[] = {0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0};
2941 int in_h[] = {0, 0, 0, 0, 1, 1, 0, 0};
2942 int in_s[] = {0, 1, 1, 0};
2943 int in_d[] = {1, 1};
2944
2945 Initialise(&masm, p0.VnB(), in_b);
2946 Initialise(&masm, p1.VnH(), in_h);
2947 Initialise(&masm, p2.VnS(), in_s);
2948 Initialise(&masm, p3.VnD(), in_d);
2949
2950 // Initialise NZCV to an impossible value, to check that we actually write it.
2951 __ Mov(x10, NZCVFlag);
2952
2953 __ Msr(NZCV, x10);
2954 __ Pfirst(p0.VnB(), p0, p0.VnB());
2955 __ Mrs(x0, NZCV);
2956
2957 __ Msr(NZCV, x10);
2958 __ Pfirst(p1.VnB(), p1, p1.VnB());
2959 __ Mrs(x1, NZCV);
2960
2961 __ Msr(NZCV, x10);
2962 __ Pfirst(p2.VnB(), p2, p2.VnB());
2963 __ Mrs(x2, NZCV);
2964
2965 __ Msr(NZCV, x10);
2966 __ Pfirst(p3.VnB(), p3, p3.VnB());
2967 __ Mrs(x3, NZCV);
2968
2969 END();
2970
2971 if (CAN_RUN()) {
2972 RUN();
2973
2974 // The first lane from pg is already active in pdn, so the P register should
2975 // be unchanged.
2976 ASSERT_EQUAL_SVE(in_b, p0.VnB());
2977 ASSERT_EQUAL_SVE(in_h, p1.VnH());
2978 ASSERT_EQUAL_SVE(in_s, p2.VnS());
2979 ASSERT_EQUAL_SVE(in_d, p3.VnD());
2980
2981 ASSERT_EQUAL_64(SVEFirstFlag, x0);
2982 ASSERT_EQUAL_64(SVEFirstFlag, x1);
2983 ASSERT_EQUAL_64(SVEFirstFlag, x2);
2984 ASSERT_EQUAL_64(SVEFirstFlag, x3);
2985 }
2986}
2987
2988TEST_SVE(sve_pnext_b) {
2989 // TODO: Once we have the infrastructure, provide more lanes than kPRegMinSize
2990 // (to check propagation if we have a large VL), but few enough to make the
2991 // test easy to read.
2992 // For now, we just use kPRegMinSize so that the test works anywhere.
2993 int in0[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
2994 int in1[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
2995 int in2[] = {0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
2996 int in3[] = {0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1};
2997 int in4[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
2998
2999 // Pnext activates the next element that is true in pg, after the last-active
3000 // element in pn. If all pn elements are false (as in in0), it starts looking
3001 // at element 0.
3002
3003 // There are no active lanes in in0, so the result is simply the first active
3004 // lane from pg.
3005 int exp00[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3006 int exp10[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
3007 int exp20[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0};
3008 int exp30[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
3009 int exp40[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3010
3011 // The last active lane in in1 is here. |
3012 // v
3013 int exp01[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3014 int exp11[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3015 int exp21[] = {0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3016 int exp31[] = {0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3017 int exp41[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3018
3019 // | The last active lane in in2 is here.
3020 // v
3021 int exp02[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3022 int exp12[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3023 int exp22[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3024 int exp32[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3025 int exp42[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3026
3027 // | The last active lane in in3 is here.
3028 // v
3029 int exp03[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3030 int exp13[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3031 int exp23[] = {0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3032 int exp33[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3033 int exp43[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3034
3035 // | The last active lane in in4 is here.
3036 // v
3037 int exp04[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3038 int exp14[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3039 int exp24[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3040 int exp34[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3041 int exp44[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3042
Jacob Bramleye8289202019-07-31 11:25:23 +01003043 PnextHelper(config, kBRegSize, in0, in0, exp00);
3044 PnextHelper(config, kBRegSize, in1, in0, exp10);
3045 PnextHelper(config, kBRegSize, in2, in0, exp20);
3046 PnextHelper(config, kBRegSize, in3, in0, exp30);
3047 PnextHelper(config, kBRegSize, in4, in0, exp40);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003048
Jacob Bramleye8289202019-07-31 11:25:23 +01003049 PnextHelper(config, kBRegSize, in0, in1, exp01);
3050 PnextHelper(config, kBRegSize, in1, in1, exp11);
3051 PnextHelper(config, kBRegSize, in2, in1, exp21);
3052 PnextHelper(config, kBRegSize, in3, in1, exp31);
3053 PnextHelper(config, kBRegSize, in4, in1, exp41);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003054
Jacob Bramleye8289202019-07-31 11:25:23 +01003055 PnextHelper(config, kBRegSize, in0, in2, exp02);
3056 PnextHelper(config, kBRegSize, in1, in2, exp12);
3057 PnextHelper(config, kBRegSize, in2, in2, exp22);
3058 PnextHelper(config, kBRegSize, in3, in2, exp32);
3059 PnextHelper(config, kBRegSize, in4, in2, exp42);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003060
Jacob Bramleye8289202019-07-31 11:25:23 +01003061 PnextHelper(config, kBRegSize, in0, in3, exp03);
3062 PnextHelper(config, kBRegSize, in1, in3, exp13);
3063 PnextHelper(config, kBRegSize, in2, in3, exp23);
3064 PnextHelper(config, kBRegSize, in3, in3, exp33);
3065 PnextHelper(config, kBRegSize, in4, in3, exp43);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003066
Jacob Bramleye8289202019-07-31 11:25:23 +01003067 PnextHelper(config, kBRegSize, in0, in4, exp04);
3068 PnextHelper(config, kBRegSize, in1, in4, exp14);
3069 PnextHelper(config, kBRegSize, in2, in4, exp24);
3070 PnextHelper(config, kBRegSize, in3, in4, exp34);
3071 PnextHelper(config, kBRegSize, in4, in4, exp44);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003072}
3073
Jacob Bramleye8289202019-07-31 11:25:23 +01003074TEST_SVE(sve_pnext_h) {
Jacob Bramley0ce75842019-07-17 18:12:50 +01003075 // TODO: Once we have the infrastructure, provide more lanes than kPRegMinSize
3076 // (to check propagation if we have a large VL), but few enough to make the
3077 // test easy to read.
3078 // For now, we just use kPRegMinSize so that the test works anywhere.
3079 int in0[] = {0, 0, 0, 0, 0, 0, 0, 0};
3080 int in1[] = {0, 0, 0, 1, 0, 2, 1, 0};
3081 int in2[] = {0, 1, 2, 0, 2, 0, 2, 0};
3082 int in3[] = {0, 0, 0, 3, 0, 0, 0, 3};
3083 int in4[] = {3, 0, 0, 0, 0, 0, 0, 0};
3084
3085 // Pnext activates the next element that is true in pg, after the last-active
3086 // element in pn. If all pn elements are false (as in in0), it starts looking
3087 // at element 0.
3088 //
3089 // As for other SVE instructions, elements are only considered to be active if
3090 // the _first_ bit in each field is one. Other bits are ignored.
3091
3092 // There are no active lanes in in0, so the result is simply the first active
3093 // lane from pg.
3094 int exp00[] = {0, 0, 0, 0, 0, 0, 0, 0};
3095 int exp10[] = {0, 0, 0, 0, 0, 0, 1, 0};
3096 int exp20[] = {0, 1, 0, 0, 0, 0, 0, 0};
3097 int exp30[] = {0, 0, 0, 0, 0, 0, 0, 1};
3098 int exp40[] = {1, 0, 0, 0, 0, 0, 0, 0};
3099
3100 // | The last active lane in in1 is here.
3101 // v
3102 int exp01[] = {0, 0, 0, 0, 0, 0, 0, 0};
3103 int exp11[] = {0, 0, 0, 0, 0, 0, 0, 0};
3104 int exp21[] = {0, 1, 0, 0, 0, 0, 0, 0};
3105 int exp31[] = {0, 0, 0, 0, 0, 0, 0, 0};
3106 int exp41[] = {1, 0, 0, 0, 0, 0, 0, 0};
3107
3108 // | The last active lane in in2 is here.
3109 // v
3110 int exp02[] = {0, 0, 0, 0, 0, 0, 0, 0};
3111 int exp12[] = {0, 0, 0, 0, 0, 0, 0, 0};
3112 int exp22[] = {0, 0, 0, 0, 0, 0, 0, 0};
3113 int exp32[] = {0, 0, 0, 0, 0, 0, 0, 0};
3114 int exp42[] = {1, 0, 0, 0, 0, 0, 0, 0};
3115
3116 // | The last active lane in in3 is here.
3117 // v
3118 int exp03[] = {0, 0, 0, 0, 0, 0, 0, 0};
3119 int exp13[] = {0, 0, 0, 0, 0, 0, 0, 0};
3120 int exp23[] = {0, 1, 0, 0, 0, 0, 0, 0};
3121 int exp33[] = {0, 0, 0, 0, 0, 0, 0, 0};
3122 int exp43[] = {1, 0, 0, 0, 0, 0, 0, 0};
3123
3124 // | The last active lane in in4 is here.
3125 // v
3126 int exp04[] = {0, 0, 0, 0, 0, 0, 0, 0};
3127 int exp14[] = {0, 0, 0, 0, 0, 0, 0, 0};
3128 int exp24[] = {0, 0, 0, 0, 0, 0, 0, 0};
3129 int exp34[] = {0, 0, 0, 0, 0, 0, 0, 0};
3130 int exp44[] = {0, 0, 0, 0, 0, 0, 0, 0};
3131
Jacob Bramleye8289202019-07-31 11:25:23 +01003132 PnextHelper(config, kHRegSize, in0, in0, exp00);
3133 PnextHelper(config, kHRegSize, in1, in0, exp10);
3134 PnextHelper(config, kHRegSize, in2, in0, exp20);
3135 PnextHelper(config, kHRegSize, in3, in0, exp30);
3136 PnextHelper(config, kHRegSize, in4, in0, exp40);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003137
Jacob Bramleye8289202019-07-31 11:25:23 +01003138 PnextHelper(config, kHRegSize, in0, in1, exp01);
3139 PnextHelper(config, kHRegSize, in1, in1, exp11);
3140 PnextHelper(config, kHRegSize, in2, in1, exp21);
3141 PnextHelper(config, kHRegSize, in3, in1, exp31);
3142 PnextHelper(config, kHRegSize, in4, in1, exp41);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003143
Jacob Bramleye8289202019-07-31 11:25:23 +01003144 PnextHelper(config, kHRegSize, in0, in2, exp02);
3145 PnextHelper(config, kHRegSize, in1, in2, exp12);
3146 PnextHelper(config, kHRegSize, in2, in2, exp22);
3147 PnextHelper(config, kHRegSize, in3, in2, exp32);
3148 PnextHelper(config, kHRegSize, in4, in2, exp42);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003149
Jacob Bramleye8289202019-07-31 11:25:23 +01003150 PnextHelper(config, kHRegSize, in0, in3, exp03);
3151 PnextHelper(config, kHRegSize, in1, in3, exp13);
3152 PnextHelper(config, kHRegSize, in2, in3, exp23);
3153 PnextHelper(config, kHRegSize, in3, in3, exp33);
3154 PnextHelper(config, kHRegSize, in4, in3, exp43);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003155
Jacob Bramleye8289202019-07-31 11:25:23 +01003156 PnextHelper(config, kHRegSize, in0, in4, exp04);
3157 PnextHelper(config, kHRegSize, in1, in4, exp14);
3158 PnextHelper(config, kHRegSize, in2, in4, exp24);
3159 PnextHelper(config, kHRegSize, in3, in4, exp34);
3160 PnextHelper(config, kHRegSize, in4, in4, exp44);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003161}
3162
Jacob Bramleye8289202019-07-31 11:25:23 +01003163TEST_SVE(sve_pnext_s) {
Jacob Bramley0ce75842019-07-17 18:12:50 +01003164 // TODO: Once we have the infrastructure, provide more lanes than kPRegMinSize
3165 // (to check propagation if we have a large VL), but few enough to make the
3166 // test easy to read.
3167 // For now, we just use kPRegMinSize so that the test works anywhere.
3168 int in0[] = {0xe, 0xc, 0x8, 0x0};
3169 int in1[] = {0x0, 0x2, 0x0, 0x1};
3170 int in2[] = {0x0, 0x1, 0xf, 0x0};
3171 int in3[] = {0xf, 0x0, 0x0, 0x0};
3172
3173 // Pnext activates the next element that is true in pg, after the last-active
3174 // element in pn. If all pn elements are false (as in in0), it starts looking
3175 // at element 0.
3176 //
3177 // As for other SVE instructions, elements are only considered to be active if
3178 // the _first_ bit in each field is one. Other bits are ignored.
3179
3180 // There are no active lanes in in0, so the result is simply the first active
3181 // lane from pg.
3182 int exp00[] = {0, 0, 0, 0};
3183 int exp10[] = {0, 0, 0, 1};
3184 int exp20[] = {0, 0, 1, 0};
3185 int exp30[] = {1, 0, 0, 0};
3186
3187 // | The last active lane in in1 is here.
3188 // v
3189 int exp01[] = {0, 0, 0, 0};
3190 int exp11[] = {0, 0, 0, 0};
3191 int exp21[] = {0, 0, 1, 0};
3192 int exp31[] = {1, 0, 0, 0};
3193
3194 // | The last active lane in in2 is here.
3195 // v
3196 int exp02[] = {0, 0, 0, 0};
3197 int exp12[] = {0, 0, 0, 0};
3198 int exp22[] = {0, 0, 0, 0};
3199 int exp32[] = {1, 0, 0, 0};
3200
3201 // | The last active lane in in3 is here.
3202 // v
3203 int exp03[] = {0, 0, 0, 0};
3204 int exp13[] = {0, 0, 0, 0};
3205 int exp23[] = {0, 0, 0, 0};
3206 int exp33[] = {0, 0, 0, 0};
3207
Jacob Bramleye8289202019-07-31 11:25:23 +01003208 PnextHelper(config, kSRegSize, in0, in0, exp00);
3209 PnextHelper(config, kSRegSize, in1, in0, exp10);
3210 PnextHelper(config, kSRegSize, in2, in0, exp20);
3211 PnextHelper(config, kSRegSize, in3, in0, exp30);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003212
Jacob Bramleye8289202019-07-31 11:25:23 +01003213 PnextHelper(config, kSRegSize, in0, in1, exp01);
3214 PnextHelper(config, kSRegSize, in1, in1, exp11);
3215 PnextHelper(config, kSRegSize, in2, in1, exp21);
3216 PnextHelper(config, kSRegSize, in3, in1, exp31);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003217
Jacob Bramleye8289202019-07-31 11:25:23 +01003218 PnextHelper(config, kSRegSize, in0, in2, exp02);
3219 PnextHelper(config, kSRegSize, in1, in2, exp12);
3220 PnextHelper(config, kSRegSize, in2, in2, exp22);
3221 PnextHelper(config, kSRegSize, in3, in2, exp32);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003222
Jacob Bramleye8289202019-07-31 11:25:23 +01003223 PnextHelper(config, kSRegSize, in0, in3, exp03);
3224 PnextHelper(config, kSRegSize, in1, in3, exp13);
3225 PnextHelper(config, kSRegSize, in2, in3, exp23);
3226 PnextHelper(config, kSRegSize, in3, in3, exp33);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003227}
3228
Jacob Bramleye8289202019-07-31 11:25:23 +01003229TEST_SVE(sve_pnext_d) {
Jacob Bramley0ce75842019-07-17 18:12:50 +01003230 // TODO: Once we have the infrastructure, provide more lanes than kPRegMinSize
3231 // (to check propagation if we have a large VL), but few enough to make the
3232 // test easy to read.
3233 // For now, we just use kPRegMinSize so that the test works anywhere.
3234 int in0[] = {0xfe, 0xf0};
3235 int in1[] = {0x00, 0x55};
3236 int in2[] = {0x33, 0xff};
3237
3238 // Pnext activates the next element that is true in pg, after the last-active
3239 // element in pn. If all pn elements are false (as in in0), it starts looking
3240 // at element 0.
3241 //
3242 // As for other SVE instructions, elements are only considered to be active if
3243 // the _first_ bit in each field is one. Other bits are ignored.
3244
3245 // There are no active lanes in in0, so the result is simply the first active
3246 // lane from pg.
3247 int exp00[] = {0, 0};
3248 int exp10[] = {0, 1};
3249 int exp20[] = {0, 1};
3250
3251 // | The last active lane in in1 is here.
3252 // v
3253 int exp01[] = {0, 0};
3254 int exp11[] = {0, 0};
3255 int exp21[] = {1, 0};
3256
3257 // | The last active lane in in2 is here.
3258 // v
3259 int exp02[] = {0, 0};
3260 int exp12[] = {0, 0};
3261 int exp22[] = {0, 0};
3262
Jacob Bramleye8289202019-07-31 11:25:23 +01003263 PnextHelper(config, kDRegSize, in0, in0, exp00);
3264 PnextHelper(config, kDRegSize, in1, in0, exp10);
3265 PnextHelper(config, kDRegSize, in2, in0, exp20);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003266
Jacob Bramleye8289202019-07-31 11:25:23 +01003267 PnextHelper(config, kDRegSize, in0, in1, exp01);
3268 PnextHelper(config, kDRegSize, in1, in1, exp11);
3269 PnextHelper(config, kDRegSize, in2, in1, exp21);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003270
Jacob Bramleye8289202019-07-31 11:25:23 +01003271 PnextHelper(config, kDRegSize, in0, in2, exp02);
3272 PnextHelper(config, kDRegSize, in1, in2, exp12);
3273 PnextHelper(config, kDRegSize, in2, in2, exp22);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003274}
3275
Jacob Bramleye8289202019-07-31 11:25:23 +01003276TEST_SVE(sve_pnext_alias) {
3277 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003278 START();
3279
3280 // Check that the Simulator behaves correctly when all arguments are aliased.
3281 int in_b[] = {0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0};
3282 int in_h[] = {0, 0, 0, 0, 1, 1, 0, 0};
3283 int in_s[] = {0, 1, 1, 0};
3284 int in_d[] = {1, 1};
3285
3286 Initialise(&masm, p0.VnB(), in_b);
3287 Initialise(&masm, p1.VnH(), in_h);
3288 Initialise(&masm, p2.VnS(), in_s);
3289 Initialise(&masm, p3.VnD(), in_d);
3290
3291 // Initialise NZCV to an impossible value, to check that we actually write it.
3292 __ Mov(x10, NZCVFlag);
3293
3294 __ Msr(NZCV, x10);
Jacob Bramley7b5819c2020-06-17 17:29:16 +01003295 __ Pnext(p0.VnB(), p0, p0.VnB());
Jacob Bramley0ce75842019-07-17 18:12:50 +01003296 __ Mrs(x0, NZCV);
3297
3298 __ Msr(NZCV, x10);
Jacob Bramley7b5819c2020-06-17 17:29:16 +01003299 __ Pnext(p1.VnB(), p1, p1.VnB());
Jacob Bramley0ce75842019-07-17 18:12:50 +01003300 __ Mrs(x1, NZCV);
3301
3302 __ Msr(NZCV, x10);
Jacob Bramley7b5819c2020-06-17 17:29:16 +01003303 __ Pnext(p2.VnB(), p2, p2.VnB());
Jacob Bramley0ce75842019-07-17 18:12:50 +01003304 __ Mrs(x2, NZCV);
3305
3306 __ Msr(NZCV, x10);
Jacob Bramley7b5819c2020-06-17 17:29:16 +01003307 __ Pnext(p3.VnB(), p3, p3.VnB());
Jacob Bramley0ce75842019-07-17 18:12:50 +01003308 __ Mrs(x3, NZCV);
3309
3310 END();
3311
3312 if (CAN_RUN()) {
3313 RUN();
3314
3315 // Since pg.Is(pdn), there can be no active lanes in pg above the last
3316 // active lane in pdn, so the result should always be zero.
3317 ASSERT_EQUAL_SVE(0, p0.VnB());
3318 ASSERT_EQUAL_SVE(0, p1.VnH());
3319 ASSERT_EQUAL_SVE(0, p2.VnS());
3320 ASSERT_EQUAL_SVE(0, p3.VnD());
3321
3322 ASSERT_EQUAL_64(SVENoneFlag | SVENotLastFlag, x0);
3323 ASSERT_EQUAL_64(SVENoneFlag | SVENotLastFlag, x1);
3324 ASSERT_EQUAL_64(SVENoneFlag | SVENotLastFlag, x2);
3325 ASSERT_EQUAL_64(SVENoneFlag | SVENotLastFlag, x3);
3326 }
3327}
3328
Jacob Bramleye8289202019-07-31 11:25:23 +01003329static void PtrueHelper(Test* config,
3330 unsigned lane_size_in_bits,
Jacob Bramley0ce75842019-07-17 18:12:50 +01003331 FlagsUpdate s = LeaveFlags) {
Jacob Bramleye8289202019-07-31 11:25:23 +01003332 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003333 START();
3334
3335 PRegisterWithLaneSize p[kNumberOfPRegisters];
3336 for (unsigned i = 0; i < kNumberOfPRegisters; i++) {
3337 p[i] = PRegister(i).WithLaneSize(lane_size_in_bits);
3338 }
3339
3340 // Initialise NZCV to an impossible value, to check that we actually write it.
3341 StatusFlags nzcv_unmodified = NZCVFlag;
3342 __ Mov(x20, nzcv_unmodified);
3343
3344 // We don't have enough registers to conveniently test every pattern, so take
3345 // samples from each group.
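  // As a reminder of the pattern semantics relied on below: SVE_POW2 selects
  // the largest power-of-two number of lanes that fits, SVE_VLn selects
  // exactly n lanes (or none if n lanes are not available), SVE_MUL3/SVE_MUL4
  // round the lane count down to a multiple of 3 or 4, and SVE_ALL selects
  // every lane.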
3346 __ Msr(NZCV, x20);
3347 __ Ptrue(p[0], SVE_POW2, s);
3348 __ Mrs(x0, NZCV);
3349
3350 __ Msr(NZCV, x20);
3351 __ Ptrue(p[1], SVE_VL1, s);
3352 __ Mrs(x1, NZCV);
3353
3354 __ Msr(NZCV, x20);
3355 __ Ptrue(p[2], SVE_VL2, s);
3356 __ Mrs(x2, NZCV);
3357
3358 __ Msr(NZCV, x20);
3359 __ Ptrue(p[3], SVE_VL5, s);
3360 __ Mrs(x3, NZCV);
3361
3362 __ Msr(NZCV, x20);
3363 __ Ptrue(p[4], SVE_VL6, s);
3364 __ Mrs(x4, NZCV);
3365
3366 __ Msr(NZCV, x20);
3367 __ Ptrue(p[5], SVE_VL8, s);
3368 __ Mrs(x5, NZCV);
3369
3370 __ Msr(NZCV, x20);
3371 __ Ptrue(p[6], SVE_VL16, s);
3372 __ Mrs(x6, NZCV);
3373
3374 __ Msr(NZCV, x20);
3375 __ Ptrue(p[7], SVE_VL64, s);
3376 __ Mrs(x7, NZCV);
3377
3378 __ Msr(NZCV, x20);
3379 __ Ptrue(p[8], SVE_VL256, s);
3380 __ Mrs(x8, NZCV);
3381
3382 {
3383 // We have to use the Assembler to encode pattern values not defined by
3384 // SVEPredicateConstraint, so call `ptrue`/`ptrues` directly.
3385 typedef void (
3386 MacroAssembler::*AssemblePtrueFn)(const PRegisterWithLaneSize& pd,
3387 int pattern);
3388 AssemblePtrueFn assemble =
3389 (s == SetFlags) ? &MacroAssembler::ptrues : &MacroAssembler::ptrue;
3390
3391 ExactAssemblyScope guard(&masm, 12 * kInstructionSize);
3392 __ msr(NZCV, x20);
3393 (masm.*assemble)(p[9], 0xe);
3394 __ mrs(x9, NZCV);
3395
3396 __ msr(NZCV, x20);
3397 (masm.*assemble)(p[10], 0x16);
3398 __ mrs(x10, NZCV);
3399
3400 __ msr(NZCV, x20);
3401 (masm.*assemble)(p[11], 0x1a);
3402 __ mrs(x11, NZCV);
3403
3404 __ msr(NZCV, x20);
3405 (masm.*assemble)(p[12], 0x1c);
3406 __ mrs(x12, NZCV);
3407 }
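  // The raw encodings used above (0xe, 0x16, 0x1a and 0x1c) are unallocated
  // pattern values; as the checks below expect, they select no lanes, so the
  // resulting predicates are all-false.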
3408
3409 __ Msr(NZCV, x20);
3410 __ Ptrue(p[13], SVE_MUL4, s);
3411 __ Mrs(x13, NZCV);
3412
3413 __ Msr(NZCV, x20);
3414 __ Ptrue(p[14], SVE_MUL3, s);
3415 __ Mrs(x14, NZCV);
3416
3417 __ Msr(NZCV, x20);
3418 __ Ptrue(p[15], SVE_ALL, s);
3419 __ Mrs(x15, NZCV);
3420
3421 END();
3422
3423 if (CAN_RUN()) {
3424 RUN();
3425
3426 int all = core.GetSVELaneCount(lane_size_in_bits);
3427 int pow2 = 1 << HighestSetBitPosition(all);
3428 int mul4 = all - (all % 4);
3429 int mul3 = all - (all % 3);
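    // For example, with 48 H lanes: pow2 = 32, mul4 = 48 and mul3 = 48; with
    // 28 S lanes: pow2 = 16, mul4 = 28 and mul3 = 27.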
3430
3431 // Check P register results.
3432 for (int i = 0; i < all; i++) {
3433 ASSERT_EQUAL_SVE_LANE(i < pow2, p[0], i);
3434 ASSERT_EQUAL_SVE_LANE((all >= 1) && (i < 1), p[1], i);
3435 ASSERT_EQUAL_SVE_LANE((all >= 2) && (i < 2), p[2], i);
3436 ASSERT_EQUAL_SVE_LANE((all >= 5) && (i < 5), p[3], i);
3437 ASSERT_EQUAL_SVE_LANE((all >= 6) && (i < 6), p[4], i);
3438 ASSERT_EQUAL_SVE_LANE((all >= 8) && (i < 8), p[5], i);
3439 ASSERT_EQUAL_SVE_LANE((all >= 16) && (i < 16), p[6], i);
3440 ASSERT_EQUAL_SVE_LANE((all >= 64) && (i < 64), p[7], i);
3441 ASSERT_EQUAL_SVE_LANE((all >= 256) && (i < 256), p[8], i);
3442 ASSERT_EQUAL_SVE_LANE(false, p[9], i);
3443 ASSERT_EQUAL_SVE_LANE(false, p[10], i);
3444 ASSERT_EQUAL_SVE_LANE(false, p[11], i);
3445 ASSERT_EQUAL_SVE_LANE(false, p[12], i);
3446 ASSERT_EQUAL_SVE_LANE(i < mul4, p[13], i);
3447 ASSERT_EQUAL_SVE_LANE(i < mul3, p[14], i);
3448 ASSERT_EQUAL_SVE_LANE(true, p[15], i);
3449 }
3450
3451 // Check NZCV results.
3452 if (s == LeaveFlags) {
3453 // No flags should have been updated.
3454 for (int i = 0; i <= 15; i++) {
3455 ASSERT_EQUAL_64(nzcv_unmodified, XRegister(i));
3456 }
3457 } else {
3458 StatusFlags zero = static_cast<StatusFlags>(SVENoneFlag | SVENotLastFlag);
3459 StatusFlags nonzero = SVEFirstFlag;
3460
3461 // POW2
3462 ASSERT_EQUAL_64(nonzero, x0);
3463 // VL*
3464 ASSERT_EQUAL_64((all >= 1) ? nonzero : zero, x1);
3465 ASSERT_EQUAL_64((all >= 2) ? nonzero : zero, x2);
3466 ASSERT_EQUAL_64((all >= 5) ? nonzero : zero, x3);
3467 ASSERT_EQUAL_64((all >= 6) ? nonzero : zero, x4);
3468 ASSERT_EQUAL_64((all >= 8) ? nonzero : zero, x5);
3469 ASSERT_EQUAL_64((all >= 16) ? nonzero : zero, x6);
3470 ASSERT_EQUAL_64((all >= 64) ? nonzero : zero, x7);
3471 ASSERT_EQUAL_64((all >= 256) ? nonzero : zero, x8);
3472 // #uimm5
3473 ASSERT_EQUAL_64(zero, x9);
3474 ASSERT_EQUAL_64(zero, x10);
3475 ASSERT_EQUAL_64(zero, x11);
3476 ASSERT_EQUAL_64(zero, x12);
3477 // MUL*
3478 ASSERT_EQUAL_64((all >= 4) ? nonzero : zero, x13);
3479 ASSERT_EQUAL_64((all >= 3) ? nonzero : zero, x14);
3480 // ALL
3481 ASSERT_EQUAL_64(nonzero, x15);
3482 }
3483 }
3484}
3485
Jacob Bramleye8289202019-07-31 11:25:23 +01003486TEST_SVE(sve_ptrue_b) { PtrueHelper(config, kBRegSize, LeaveFlags); }
3487TEST_SVE(sve_ptrue_h) { PtrueHelper(config, kHRegSize, LeaveFlags); }
3488TEST_SVE(sve_ptrue_s) { PtrueHelper(config, kSRegSize, LeaveFlags); }
3489TEST_SVE(sve_ptrue_d) { PtrueHelper(config, kDRegSize, LeaveFlags); }
Jacob Bramley0ce75842019-07-17 18:12:50 +01003490
Jacob Bramleye8289202019-07-31 11:25:23 +01003491TEST_SVE(sve_ptrues_b) { PtrueHelper(config, kBRegSize, SetFlags); }
3492TEST_SVE(sve_ptrues_h) { PtrueHelper(config, kHRegSize, SetFlags); }
3493TEST_SVE(sve_ptrues_s) { PtrueHelper(config, kSRegSize, SetFlags); }
3494TEST_SVE(sve_ptrues_d) { PtrueHelper(config, kDRegSize, SetFlags); }
Jacob Bramley0ce75842019-07-17 18:12:50 +01003495
Jacob Bramleye8289202019-07-31 11:25:23 +01003496TEST_SVE(sve_pfalse) {
3497 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003498 START();
3499
3500 // Initialise non-zero inputs.
3501 __ Ptrue(p0.VnB());
3502 __ Ptrue(p1.VnH());
3503 __ Ptrue(p2.VnS());
3504 __ Ptrue(p3.VnD());
3505
3506 // The instruction only supports B-sized lanes, but the lane size has no
3507 // logical effect, so the MacroAssembler accepts anything.
3508 __ Pfalse(p0.VnB());
3509 __ Pfalse(p1.VnH());
3510 __ Pfalse(p2.VnS());
3511 __ Pfalse(p3.VnD());
3512
3513 END();
3514
3515 if (CAN_RUN()) {
3516 RUN();
3517
3518 ASSERT_EQUAL_SVE(0, p0.VnB());
3519 ASSERT_EQUAL_SVE(0, p1.VnB());
3520 ASSERT_EQUAL_SVE(0, p2.VnB());
3521 ASSERT_EQUAL_SVE(0, p3.VnB());
3522 }
3523}
3524
Jacob Bramleye8289202019-07-31 11:25:23 +01003525TEST_SVE(sve_ptest) {
3526 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003527 START();
3528
3529 // Initialise NZCV to a known (impossible) value.
3530 StatusFlags nzcv_unmodified = NZCVFlag;
3531 __ Mov(x0, nzcv_unmodified);
3532 __ Msr(NZCV, x0);
3533
3534 // Construct some test inputs.
3535 int in2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0};
3536 int in3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0};
3537 int in4[] = {0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0};
3538 __ Pfalse(p0.VnB());
3539 __ Ptrue(p1.VnB());
3540 Initialise(&masm, p2.VnB(), in2);
3541 Initialise(&masm, p3.VnB(), in3);
3542 Initialise(&masm, p4.VnB(), in4);
3543
3544 // All-inactive pg.
3545 __ Ptest(p0, p0.VnB());
3546 __ Mrs(x0, NZCV);
3547 __ Ptest(p0, p1.VnB());
3548 __ Mrs(x1, NZCV);
3549 __ Ptest(p0, p2.VnB());
3550 __ Mrs(x2, NZCV);
3551 __ Ptest(p0, p3.VnB());
3552 __ Mrs(x3, NZCV);
3553 __ Ptest(p0, p4.VnB());
3554 __ Mrs(x4, NZCV);
3555
3556 // All-active pg.
3557 __ Ptest(p1, p0.VnB());
3558 __ Mrs(x5, NZCV);
3559 __ Ptest(p1, p1.VnB());
3560 __ Mrs(x6, NZCV);
3561 __ Ptest(p1, p2.VnB());
3562 __ Mrs(x7, NZCV);
3563 __ Ptest(p1, p3.VnB());
3564 __ Mrs(x8, NZCV);
3565 __ Ptest(p1, p4.VnB());
3566 __ Mrs(x9, NZCV);
3567
3568 // Combinations of other inputs.
3569 __ Ptest(p2, p2.VnB());
3570 __ Mrs(x20, NZCV);
3571 __ Ptest(p2, p3.VnB());
3572 __ Mrs(x21, NZCV);
3573 __ Ptest(p2, p4.VnB());
3574 __ Mrs(x22, NZCV);
3575 __ Ptest(p3, p2.VnB());
3576 __ Mrs(x23, NZCV);
3577 __ Ptest(p3, p3.VnB());
3578 __ Mrs(x24, NZCV);
3579 __ Ptest(p3, p4.VnB());
3580 __ Mrs(x25, NZCV);
3581 __ Ptest(p4, p2.VnB());
3582 __ Mrs(x26, NZCV);
3583 __ Ptest(p4, p3.VnB());
3584 __ Mrs(x27, NZCV);
3585 __ Ptest(p4, p4.VnB());
3586 __ Mrs(x28, NZCV);
3587
3588 END();
3589
3590 if (CAN_RUN()) {
3591 RUN();
3592
3593 StatusFlags zero = static_cast<StatusFlags>(SVENoneFlag | SVENotLastFlag);
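
    // As assumed throughout these checks, SVEFirstFlag (N) means the first
    // active lane of pn was true, SVENoneFlag (Z) means no active lane was
    // true, and SVENotLastFlag (C) means the last active lane was not true.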
3594
3595 // If pg is all inactive, the value of pn is irrelevant.
3596 ASSERT_EQUAL_64(zero, x0);
3597 ASSERT_EQUAL_64(zero, x1);
3598 ASSERT_EQUAL_64(zero, x2);
3599 ASSERT_EQUAL_64(zero, x3);
3600 ASSERT_EQUAL_64(zero, x4);
3601
3602 // All-active pg.
3603 ASSERT_EQUAL_64(zero, x5); // All-inactive pn.
3604 ASSERT_EQUAL_64(SVEFirstFlag, x6); // All-active pn.
3605 // Other pn inputs are non-zero, but the first and last lanes are inactive.
3606 ASSERT_EQUAL_64(SVENotLastFlag, x7);
3607 ASSERT_EQUAL_64(SVENotLastFlag, x8);
3608 ASSERT_EQUAL_64(SVENotLastFlag, x9);
3609
3610 // Other inputs.
3611 ASSERT_EQUAL_64(SVEFirstFlag, x20); // pg: in2, pn: in2
3612 ASSERT_EQUAL_64(NoFlag, x21); // pg: in2, pn: in3
3613 ASSERT_EQUAL_64(zero, x22); // pg: in2, pn: in4
3614 ASSERT_EQUAL_64(static_cast<StatusFlags>(SVEFirstFlag | SVENotLastFlag),
3615 x23); // pg: in3, pn: in2
3616 ASSERT_EQUAL_64(SVEFirstFlag, x24); // pg: in3, pn: in3
3617 ASSERT_EQUAL_64(zero, x25); // pg: in3, pn: in4
3618 ASSERT_EQUAL_64(zero, x26); // pg: in4, pn: in2
3619 ASSERT_EQUAL_64(zero, x27); // pg: in4, pn: in3
3620 ASSERT_EQUAL_64(SVEFirstFlag, x28); // pg: in4, pn: in4
3621 }
3622}
3623
Jacob Bramleye8289202019-07-31 11:25:23 +01003624TEST_SVE(sve_cntp) {
3625 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramleyd961a0c2019-07-17 10:53:45 +01003626 START();
3627
3628 // There are {7, 5, 2, 1} active {B, H, S, D} lanes.
3629 int p0_inputs[] = {0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0};
3630 Initialise(&masm, p0.VnB(), p0_inputs);
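  // When the same register is viewed as H, S or D lanes, only the lowest bit
  // of each field is significant, which is why the counts drop to 5, 2 and 1
  // for the wider lane sizes.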
3631
3632 // With an all-true predicate, these instructions measure the vector length.
3633 __ Ptrue(p10.VnB());
3634 __ Ptrue(p11.VnH());
3635 __ Ptrue(p12.VnS());
3636 __ Ptrue(p13.VnD());
3637
3638 // `ptrue p10.b` provides an all-active pg.
3639 __ Cntp(x10, p10, p10.VnB());
3640 __ Cntp(x11, p10, p11.VnH());
3641 __ Cntp(x12, p10, p12.VnS());
3642 __ Cntp(x13, p10, p13.VnD());
3643
3644 // Check that the predicate mask is applied properly.
3645 __ Cntp(x14, p10, p10.VnB());
3646 __ Cntp(x15, p11, p10.VnB());
3647 __ Cntp(x16, p12, p10.VnB());
3648 __ Cntp(x17, p13, p10.VnB());
3649
3650 // Check other patterns (including some ignored bits).
3651 __ Cntp(x0, p10, p0.VnB());
3652 __ Cntp(x1, p10, p0.VnH());
3653 __ Cntp(x2, p10, p0.VnS());
3654 __ Cntp(x3, p10, p0.VnD());
3655 __ Cntp(x4, p0, p10.VnB());
3656 __ Cntp(x5, p0, p10.VnH());
3657 __ Cntp(x6, p0, p10.VnS());
3658 __ Cntp(x7, p0, p10.VnD());
3659
3660 END();
3661
3662 if (CAN_RUN()) {
3663 RUN();
3664
3665 int vl_b = core.GetSVELaneCount(kBRegSize);
3666 int vl_h = core.GetSVELaneCount(kHRegSize);
3667 int vl_s = core.GetSVELaneCount(kSRegSize);
3668 int vl_d = core.GetSVELaneCount(kDRegSize);
3669
3670 // Check all-active predicates in various combinations.
3671 ASSERT_EQUAL_64(vl_b, x10);
3672 ASSERT_EQUAL_64(vl_h, x11);
3673 ASSERT_EQUAL_64(vl_s, x12);
3674 ASSERT_EQUAL_64(vl_d, x13);
3675
3676 ASSERT_EQUAL_64(vl_b, x14);
3677 ASSERT_EQUAL_64(vl_h, x15);
3678 ASSERT_EQUAL_64(vl_s, x16);
3679 ASSERT_EQUAL_64(vl_d, x17);
3680
3681 // Check that irrelevant bits are properly ignored.
3682 ASSERT_EQUAL_64(7, x0);
3683 ASSERT_EQUAL_64(5, x1);
3684 ASSERT_EQUAL_64(2, x2);
3685 ASSERT_EQUAL_64(1, x3);
3686
3687 ASSERT_EQUAL_64(7, x4);
3688 ASSERT_EQUAL_64(5, x5);
3689 ASSERT_EQUAL_64(2, x6);
3690 ASSERT_EQUAL_64(1, x7);
3691 }
3692}
3693
Martyn Capewell74f84f62019-10-30 15:30:44 +00003694typedef void (MacroAssembler::*CntFn)(const Register& dst,
3695 int pattern,
3696 int multiplier);
3697
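// Generate one `cnt`-family instruction (for example Cntb or Incw) per sampled
// pattern, accumulating into x0-x15 and x18-x21. x16 and x17 are avoided since
// they are conventionally reserved as scratch registers.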
Martyn Capewell91d5ba32019-11-01 18:11:23 +00003698template <typename T>
3699void GenerateCntSequence(MacroAssembler* masm,
3700 CntFn cnt,
3701 T acc_value,
3702 int multiplier) {
3703 // Initialise accumulators.
3704 masm->Mov(x0, acc_value);
3705 masm->Mov(x1, acc_value);
3706 masm->Mov(x2, acc_value);
3707 masm->Mov(x3, acc_value);
3708 masm->Mov(x4, acc_value);
3709 masm->Mov(x5, acc_value);
3710 masm->Mov(x6, acc_value);
3711 masm->Mov(x7, acc_value);
3712 masm->Mov(x8, acc_value);
3713 masm->Mov(x9, acc_value);
3714 masm->Mov(x10, acc_value);
3715 masm->Mov(x11, acc_value);
3716 masm->Mov(x12, acc_value);
3717 masm->Mov(x13, acc_value);
3718 masm->Mov(x14, acc_value);
3719 masm->Mov(x15, acc_value);
3720 masm->Mov(x18, acc_value);
3721 masm->Mov(x19, acc_value);
3722 masm->Mov(x20, acc_value);
3723 masm->Mov(x21, acc_value);
3724
3725 (masm->*cnt)(Register(0, sizeof(T) * kBitsPerByte), SVE_POW2, multiplier);
3726 (masm->*cnt)(Register(1, sizeof(T) * kBitsPerByte), SVE_VL1, multiplier);
3727 (masm->*cnt)(Register(2, sizeof(T) * kBitsPerByte), SVE_VL2, multiplier);
3728 (masm->*cnt)(Register(3, sizeof(T) * kBitsPerByte), SVE_VL3, multiplier);
3729 (masm->*cnt)(Register(4, sizeof(T) * kBitsPerByte), SVE_VL4, multiplier);
3730 (masm->*cnt)(Register(5, sizeof(T) * kBitsPerByte), SVE_VL5, multiplier);
3731 (masm->*cnt)(Register(6, sizeof(T) * kBitsPerByte), SVE_VL6, multiplier);
3732 (masm->*cnt)(Register(7, sizeof(T) * kBitsPerByte), SVE_VL7, multiplier);
3733 (masm->*cnt)(Register(8, sizeof(T) * kBitsPerByte), SVE_VL8, multiplier);
3734 (masm->*cnt)(Register(9, sizeof(T) * kBitsPerByte), SVE_VL16, multiplier);
3735 (masm->*cnt)(Register(10, sizeof(T) * kBitsPerByte), SVE_VL32, multiplier);
3736 (masm->*cnt)(Register(11, sizeof(T) * kBitsPerByte), SVE_VL64, multiplier);
3737 (masm->*cnt)(Register(12, sizeof(T) * kBitsPerByte), SVE_VL128, multiplier);
3738 (masm->*cnt)(Register(13, sizeof(T) * kBitsPerByte), SVE_VL256, multiplier);
3739 (masm->*cnt)(Register(14, sizeof(T) * kBitsPerByte), 16, multiplier);
3740 (masm->*cnt)(Register(15, sizeof(T) * kBitsPerByte), 23, multiplier);
3741 (masm->*cnt)(Register(18, sizeof(T) * kBitsPerByte), 28, multiplier);
3742 (masm->*cnt)(Register(19, sizeof(T) * kBitsPerByte), SVE_MUL4, multiplier);
3743 (masm->*cnt)(Register(20, sizeof(T) * kBitsPerByte), SVE_MUL3, multiplier);
3744 (masm->*cnt)(Register(21, sizeof(T) * kBitsPerByte), SVE_ALL, multiplier);
3745}
3746
3747int FixedVL(int fixed, int length) {
3748 VIXL_ASSERT(((fixed >= 1) && (fixed <= 8)) || (fixed == 16) ||
3749 (fixed == 32) || (fixed == 64) || (fixed == 128) ||
3750 (fixed == 256));
3751 return (length >= fixed) ? fixed : 0;
3752}
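// For example, with 32 B lanes (a 256-bit VL), FixedVL(16, 32) is 16, but
// FixedVL(64, 32) is 0: a fixed-length pattern that does not fit selects no
// elements.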
3753
Martyn Capewell74f84f62019-10-30 15:30:44 +00003754static void CntHelper(Test* config,
3755 CntFn cnt,
3756 int multiplier,
Martyn Capewell579c92d2019-10-30 17:48:52 +00003757 int lane_size_in_bits,
3758 int64_t acc_value = 0,
3759 bool is_increment = true) {
Martyn Capewell74f84f62019-10-30 15:30:44 +00003760 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
3761 START();
Martyn Capewell91d5ba32019-11-01 18:11:23 +00003762 GenerateCntSequence(&masm, cnt, acc_value, multiplier);
Martyn Capewell74f84f62019-10-30 15:30:44 +00003763 END();
3764
3765 if (CAN_RUN()) {
3766 RUN();
3767
3768 int all = core.GetSVELaneCount(lane_size_in_bits);
3769 int pow2 = 1 << HighestSetBitPosition(all);
3770 int mul4 = all - (all % 4);
3771 int mul3 = all - (all % 3);
3772
Martyn Capewell579c92d2019-10-30 17:48:52 +00003773 multiplier = is_increment ? multiplier : -multiplier;
3774
3775 ASSERT_EQUAL_64(acc_value + (multiplier * pow2), x0);
Martyn Capewell91d5ba32019-11-01 18:11:23 +00003776 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(1, all)), x1);
3777 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(2, all)), x2);
3778 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(3, all)), x3);
3779 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(4, all)), x4);
3780 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(5, all)), x5);
3781 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(6, all)), x6);
3782 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(7, all)), x7);
3783 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(8, all)), x8);
3784 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(16, all)), x9);
3785 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(32, all)), x10);
3786 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(64, all)), x11);
3787 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(128, all)), x12);
3788 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(256, all)), x13);
Martyn Capewell579c92d2019-10-30 17:48:52 +00003789 ASSERT_EQUAL_64(acc_value, x14);
3790 ASSERT_EQUAL_64(acc_value, x15);
3791 ASSERT_EQUAL_64(acc_value, x18);
3792 ASSERT_EQUAL_64(acc_value + (multiplier * mul4), x19);
3793 ASSERT_EQUAL_64(acc_value + (multiplier * mul3), x20);
3794 ASSERT_EQUAL_64(acc_value + (multiplier * all), x21);
Martyn Capewell74f84f62019-10-30 15:30:44 +00003795 }
3796}
3797
Martyn Capewell579c92d2019-10-30 17:48:52 +00003798static void IncHelper(Test* config,
3799 CntFn cnt,
3800 int multiplier,
3801 int lane_size_in_bits,
3802 int64_t acc_value) {
3803 CntHelper(config, cnt, multiplier, lane_size_in_bits, acc_value, true);
3804}
3805
3806static void DecHelper(Test* config,
3807 CntFn cnt,
3808 int multiplier,
3809 int lane_size_in_bits,
3810 int64_t acc_value) {
3811 CntHelper(config, cnt, multiplier, lane_size_in_bits, acc_value, false);
3812}
3813
Martyn Capewell74f84f62019-10-30 15:30:44 +00003814TEST_SVE(sve_cntb) {
3815 CntHelper(config, &MacroAssembler::Cntb, 1, kBRegSize);
3816 CntHelper(config, &MacroAssembler::Cntb, 2, kBRegSize);
3817 CntHelper(config, &MacroAssembler::Cntb, 15, kBRegSize);
3818 CntHelper(config, &MacroAssembler::Cntb, 16, kBRegSize);
3819}
3820
3821TEST_SVE(sve_cnth) {
3822 CntHelper(config, &MacroAssembler::Cnth, 1, kHRegSize);
3823 CntHelper(config, &MacroAssembler::Cnth, 2, kHRegSize);
3824 CntHelper(config, &MacroAssembler::Cnth, 15, kHRegSize);
3825 CntHelper(config, &MacroAssembler::Cnth, 16, kHRegSize);
3826}
3827
3828TEST_SVE(sve_cntw) {
3829 CntHelper(config, &MacroAssembler::Cntw, 1, kWRegSize);
3830 CntHelper(config, &MacroAssembler::Cntw, 2, kWRegSize);
3831 CntHelper(config, &MacroAssembler::Cntw, 15, kWRegSize);
3832 CntHelper(config, &MacroAssembler::Cntw, 16, kWRegSize);
3833}
3834
3835TEST_SVE(sve_cntd) {
3836 CntHelper(config, &MacroAssembler::Cntd, 1, kDRegSize);
3837 CntHelper(config, &MacroAssembler::Cntd, 2, kDRegSize);
3838 CntHelper(config, &MacroAssembler::Cntd, 15, kDRegSize);
3839 CntHelper(config, &MacroAssembler::Cntd, 16, kDRegSize);
3840}
3841
Martyn Capewell579c92d2019-10-30 17:48:52 +00003842TEST_SVE(sve_decb) {
3843 DecHelper(config, &MacroAssembler::Decb, 1, kBRegSize, 42);
3844 DecHelper(config, &MacroAssembler::Decb, 2, kBRegSize, -1);
3845 DecHelper(config, &MacroAssembler::Decb, 15, kBRegSize, INT64_MIN);
3846 DecHelper(config, &MacroAssembler::Decb, 16, kBRegSize, -42);
3847}
3848
3849TEST_SVE(sve_dech) {
3850 DecHelper(config, &MacroAssembler::Dech, 1, kHRegSize, 42);
3851 DecHelper(config, &MacroAssembler::Dech, 2, kHRegSize, -1);
3852 DecHelper(config, &MacroAssembler::Dech, 15, kHRegSize, INT64_MIN);
3853 DecHelper(config, &MacroAssembler::Dech, 16, kHRegSize, -42);
3854}
3855
3856TEST_SVE(sve_decw) {
3857 DecHelper(config, &MacroAssembler::Decw, 1, kWRegSize, 42);
3858 DecHelper(config, &MacroAssembler::Decw, 2, kWRegSize, -1);
3859 DecHelper(config, &MacroAssembler::Decw, 15, kWRegSize, INT64_MIN);
3860 DecHelper(config, &MacroAssembler::Decw, 16, kWRegSize, -42);
3861}
3862
3863TEST_SVE(sve_decd) {
3864 DecHelper(config, &MacroAssembler::Decd, 1, kDRegSize, 42);
3865 DecHelper(config, &MacroAssembler::Decd, 2, kDRegSize, -1);
3866 DecHelper(config, &MacroAssembler::Decd, 15, kDRegSize, INT64_MIN);
3867 DecHelper(config, &MacroAssembler::Decd, 16, kDRegSize, -42);
3868}
3869
3870TEST_SVE(sve_incb) {
3871 IncHelper(config, &MacroAssembler::Incb, 1, kBRegSize, 42);
3872 IncHelper(config, &MacroAssembler::Incb, 2, kBRegSize, -1);
3873 IncHelper(config, &MacroAssembler::Incb, 15, kBRegSize, INT64_MAX);
3874 IncHelper(config, &MacroAssembler::Incb, 16, kBRegSize, -42);
3875}
3876
3877TEST_SVE(sve_inch) {
3878 IncHelper(config, &MacroAssembler::Inch, 1, kHRegSize, 42);
3879 IncHelper(config, &MacroAssembler::Inch, 2, kHRegSize, -1);
3880 IncHelper(config, &MacroAssembler::Inch, 15, kHRegSize, INT64_MAX);
3881 IncHelper(config, &MacroAssembler::Inch, 16, kHRegSize, -42);
3882}
3883
3884TEST_SVE(sve_incw) {
3885 IncHelper(config, &MacroAssembler::Incw, 1, kWRegSize, 42);
3886 IncHelper(config, &MacroAssembler::Incw, 2, kWRegSize, -1);
3887 IncHelper(config, &MacroAssembler::Incw, 15, kWRegSize, INT64_MAX);
3888 IncHelper(config, &MacroAssembler::Incw, 16, kWRegSize, -42);
3889}
3890
3891TEST_SVE(sve_incd) {
3892 IncHelper(config, &MacroAssembler::Incd, 1, kDRegSize, 42);
3893 IncHelper(config, &MacroAssembler::Incd, 2, kDRegSize, -1);
3894 IncHelper(config, &MacroAssembler::Incd, 15, kDRegSize, INT64_MAX);
3895 IncHelper(config, &MacroAssembler::Incd, 16, kDRegSize, -42);
3896}
3897
Martyn Capewell91d5ba32019-11-01 18:11:23 +00003898template <typename T>
3899static T QAdd(T x, int y) {
3900 VIXL_ASSERT(y > INT_MIN);
3901 T result;
3902 T min = std::numeric_limits<T>::min();
3903 T max = std::numeric_limits<T>::max();
3904 if ((x >= 0) && (y >= 0)) {
3905 // For positive x and y, saturate at max.
3906 result = (max - x) < static_cast<T>(y) ? max : x + y;
3907 } else if ((y < 0) && ((x < 0) || (min == 0))) {
3908 // For negative y, where x is negative or T is unsigned, saturate at min.
3909 result = (x - min) < static_cast<T>(-y) ? min : x + y;
3910 } else {
3911 result = x + y;
3912 }
3913 return result;
3914}
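// A few illustrative values for the saturation rules above:
//   QAdd<uint32_t>(UINT32_MAX - 1, 16) == UINT32_MAX   (saturates at max)
//   QAdd<int64_t>(INT64_MIN + 1, -16) == INT64_MIN     (saturates at min)
//   QAdd<int64_t>(42, 16) == 58                        (no saturation)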
3915
3916template <typename T>
3917static void QIncDecHelper(Test* config,
3918 CntFn cnt,
3919 int multiplier,
3920 int lane_size_in_bits,
3921 T acc_value,
3922 bool is_increment) {
3923 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
3924 START();
3925 GenerateCntSequence(&masm, cnt, acc_value, multiplier);
3926 END();
3927
3928 if (CAN_RUN()) {
3929 RUN();
3930
3931 int all = core.GetSVELaneCount(lane_size_in_bits);
3932 int pow2 = 1 << HighestSetBitPosition(all);
3933 int mul4 = all - (all % 4);
3934 int mul3 = all - (all % 3);
3935
3936 multiplier = is_increment ? multiplier : -multiplier;
3937
3938 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * pow2), x0);
3939 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(1, all)), x1);
3940 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(2, all)), x2);
3941 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(3, all)), x3);
3942 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(4, all)), x4);
3943 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(5, all)), x5);
3944 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(6, all)), x6);
3945 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(7, all)), x7);
3946 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(8, all)), x8);
3947 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(16, all)), x9);
3948 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(32, all)), x10);
3949 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(64, all)), x11);
3950 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(128, all)), x12);
3951 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(256, all)), x13);
3952 ASSERT_EQUAL_64(acc_value, x14);
3953 ASSERT_EQUAL_64(acc_value, x15);
3954 ASSERT_EQUAL_64(acc_value, x18);
3955 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * mul4), x19);
3956 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * mul3), x20);
3957 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * all), x21);
3958 }
3959}
3960
3961template <typename T>
3962static void QIncHelper(Test* config,
3963 CntFn cnt,
3964 int multiplier,
3965 int lane_size_in_bits,
3966 T acc_value) {
3967 QIncDecHelper<T>(config, cnt, multiplier, lane_size_in_bits, acc_value, true);
3968}
3969
3970template <typename T>
3971static void QDecHelper(Test* config,
3972 CntFn cnt,
3973 int multiplier,
3974 int lane_size_in_bits,
3975 T acc_value) {
3976 QIncDecHelper<T>(config,
3977 cnt,
3978 multiplier,
3979 lane_size_in_bits,
3980 acc_value,
3981 false);
3982}
3983
3984TEST_SVE(sve_sqdecb) {
3985 int64_t bigneg = INT64_MIN + 42;
3986 int64_t bigpos = INT64_MAX - 42;
3987 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecb, 1, kBRegSize, 1);
3988 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecb, 2, kBRegSize, bigneg);
3989 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecb, 15, kBRegSize, 999);
3990 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecb, 16, kBRegSize, bigpos);
3991}
3992
3993TEST_SVE(sve_sqdech) {
3994 int64_t bigneg = INT64_MIN + 42;
3995 int64_t bigpos = INT64_MAX - 42;
3996 QDecHelper<int64_t>(config, &MacroAssembler::Sqdech, 1, kHRegSize, 1);
3997 QDecHelper<int64_t>(config, &MacroAssembler::Sqdech, 2, kHRegSize, bigneg);
3998 QDecHelper<int64_t>(config, &MacroAssembler::Sqdech, 15, kHRegSize, 999);
3999 QDecHelper<int64_t>(config, &MacroAssembler::Sqdech, 16, kHRegSize, bigpos);
4000}
4001
4002TEST_SVE(sve_sqdecw) {
4003 int64_t bigneg = INT64_MIN + 42;
4004 int64_t bigpos = INT64_MAX - 42;
4005 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecw, 1, kWRegSize, 1);
4006 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecw, 2, kWRegSize, bigneg);
4007 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecw, 15, kWRegSize, 999);
4008 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecw, 16, kWRegSize, bigpos);
4009}
4010
4011TEST_SVE(sve_sqdecd) {
4012 int64_t bigneg = INT64_MIN + 42;
4013 int64_t bigpos = INT64_MAX - 42;
4014 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecd, 1, kDRegSize, 1);
4015 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecd, 2, kDRegSize, bigneg);
4016 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecd, 15, kDRegSize, 999);
4017 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecd, 16, kDRegSize, bigpos);
4018}
4019
4020TEST_SVE(sve_sqincb) {
4021 int64_t bigneg = INT64_MIN + 42;
4022 int64_t bigpos = INT64_MAX - 42;
4023 QIncHelper<int64_t>(config, &MacroAssembler::Sqincb, 1, kBRegSize, 1);
4024 QIncHelper<int64_t>(config, &MacroAssembler::Sqincb, 2, kBRegSize, bigneg);
4025 QIncHelper<int64_t>(config, &MacroAssembler::Sqincb, 15, kBRegSize, 999);
4026 QIncHelper<int64_t>(config, &MacroAssembler::Sqincb, 16, kBRegSize, bigpos);
4027}
4028
4029TEST_SVE(sve_sqinch) {
4030 int64_t bigneg = INT64_MIN + 42;
4031 int64_t bigpos = INT64_MAX - 42;
4032 QIncHelper<int64_t>(config, &MacroAssembler::Sqinch, 1, kHRegSize, 1);
4033 QIncHelper<int64_t>(config, &MacroAssembler::Sqinch, 2, kHRegSize, bigneg);
4034 QIncHelper<int64_t>(config, &MacroAssembler::Sqinch, 15, kHRegSize, 999);
4035 QIncHelper<int64_t>(config, &MacroAssembler::Sqinch, 16, kHRegSize, bigpos);
4036}
4037
4038TEST_SVE(sve_sqincw) {
4039 int64_t bigneg = INT64_MIN + 42;
4040 int64_t bigpos = INT64_MAX - 42;
4041 QIncHelper<int64_t>(config, &MacroAssembler::Sqincw, 1, kWRegSize, 1);
4042 QIncHelper<int64_t>(config, &MacroAssembler::Sqincw, 2, kWRegSize, bigneg);
4043 QIncHelper<int64_t>(config, &MacroAssembler::Sqincw, 15, kWRegSize, 999);
4044 QIncHelper<int64_t>(config, &MacroAssembler::Sqincw, 16, kWRegSize, bigpos);
4045}
4046
4047TEST_SVE(sve_sqincd) {
4048 int64_t bigneg = INT64_MIN + 42;
4049 int64_t bigpos = INT64_MAX - 42;
4050 QIncHelper<int64_t>(config, &MacroAssembler::Sqincd, 1, kDRegSize, 1);
4051 QIncHelper<int64_t>(config, &MacroAssembler::Sqincd, 2, kDRegSize, bigneg);
4052 QIncHelper<int64_t>(config, &MacroAssembler::Sqincd, 15, kDRegSize, 999);
4053 QIncHelper<int64_t>(config, &MacroAssembler::Sqincd, 16, kDRegSize, bigpos);
4054}
4055
4056TEST_SVE(sve_uqdecb) {
4057 int32_t big32 = UINT32_MAX - 42;
4058 int64_t big64 = UINT64_MAX - 42;
4059 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecb, 1, kBRegSize, 1);
4060 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecb, 2, kBRegSize, 42);
4061 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecb, 15, kBRegSize, 999);
4062 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecb, 16, kBRegSize, big32);
4063 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecb, 1, kBRegSize, 1);
4064 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecb, 2, kBRegSize, 42);
4065 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecb, 15, kBRegSize, 999);
4066 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecb, 16, kBRegSize, big64);
4067}
4068
4069TEST_SVE(sve_uqdech) {
4070 int32_t big32 = UINT32_MAX - 42;
4071 int64_t big64 = UINT64_MAX - 42;
4072 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdech, 1, kHRegSize, 1);
4073 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdech, 2, kHRegSize, 42);
4074 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdech, 15, kHRegSize, 999);
4075 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdech, 16, kHRegSize, big32);
4076 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdech, 1, kHRegSize, 1);
4077 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdech, 2, kHRegSize, 42);
4078 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdech, 15, kHRegSize, 999);
4079 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdech, 16, kHRegSize, big64);
4080}
4081
4082TEST_SVE(sve_uqdecw) {
4083 int32_t big32 = UINT32_MAX - 42;
4084 int64_t big64 = UINT64_MAX - 42;
4085 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecw, 1, kWRegSize, 1);
4086 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecw, 2, kWRegSize, 42);
4087 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecw, 15, kWRegSize, 999);
4088 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecw, 16, kWRegSize, big32);
4089 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecw, 1, kWRegSize, 1);
4090 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecw, 2, kWRegSize, 42);
4091 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecw, 15, kWRegSize, 999);
4092 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecw, 16, kWRegSize, big64);
4093}
4094
4095TEST_SVE(sve_uqdecd) {
4096 int32_t big32 = UINT32_MAX - 42;
4097 int64_t big64 = UINT64_MAX - 42;
4098 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecd, 1, kDRegSize, 1);
4099 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecd, 2, kDRegSize, 42);
4100 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecd, 15, kDRegSize, 999);
4101 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecd, 16, kDRegSize, big32);
4102 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecd, 1, kDRegSize, 1);
4103 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecd, 2, kDRegSize, 42);
4104 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecd, 15, kDRegSize, 999);
4105 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecd, 16, kDRegSize, big64);
4106}
4107
4108TEST_SVE(sve_uqincb) {
4109 int32_t big32 = UINT32_MAX - 42;
4110 int64_t big64 = UINT64_MAX - 42;
4111 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincb, 1, kBRegSize, 1);
4112 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincb, 2, kBRegSize, 42);
4113 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincb, 15, kBRegSize, 999);
4114 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincb, 16, kBRegSize, big32);
4115 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincb, 1, kBRegSize, 1);
4116 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincb, 2, kBRegSize, 42);
4117 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincb, 15, kBRegSize, 999);
4118 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincb, 16, kBRegSize, big64);
4119}
4120
4121TEST_SVE(sve_uqinch) {
4122 int32_t big32 = UINT32_MAX - 42;
4123 int64_t big64 = UINT64_MAX - 42;
4124 QIncHelper<uint32_t>(config, &MacroAssembler::Uqinch, 1, kHRegSize, 1);
4125 QIncHelper<uint32_t>(config, &MacroAssembler::Uqinch, 2, kHRegSize, 42);
4126 QIncHelper<uint32_t>(config, &MacroAssembler::Uqinch, 15, kHRegSize, 999);
4127 QIncHelper<uint32_t>(config, &MacroAssembler::Uqinch, 16, kHRegSize, big32);
4128 QIncHelper<uint64_t>(config, &MacroAssembler::Uqinch, 1, kHRegSize, 1);
4129 QIncHelper<uint64_t>(config, &MacroAssembler::Uqinch, 2, kHRegSize, 42);
4130 QIncHelper<uint64_t>(config, &MacroAssembler::Uqinch, 15, kHRegSize, 999);
4131 QIncHelper<uint64_t>(config, &MacroAssembler::Uqinch, 16, kHRegSize, big64);
4132}
4133
4134TEST_SVE(sve_uqincw) {
4135 int32_t big32 = UINT32_MAX - 42;
4136 int64_t big64 = UINT64_MAX - 42;
4137 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincw, 1, kWRegSize, 1);
4138 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincw, 2, kWRegSize, 42);
4139 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincw, 15, kWRegSize, 999);
4140 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincw, 16, kWRegSize, big32);
4141 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincw, 1, kWRegSize, 1);
4142 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincw, 2, kWRegSize, 42);
4143 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincw, 15, kWRegSize, 999);
4144 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincw, 16, kWRegSize, big64);
4145}
4146
4147TEST_SVE(sve_uqincd) {
4148 int32_t big32 = UINT32_MAX - 42;
4149 int64_t big64 = UINT64_MAX - 42;
4150 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincd, 1, kDRegSize, 1);
4151 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincd, 2, kDRegSize, 42);
4152 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincd, 15, kDRegSize, 999);
4153 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincd, 16, kDRegSize, big32);
4154 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincd, 1, kDRegSize, 1);
4155 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincd, 2, kDRegSize, 42);
4156 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincd, 15, kDRegSize, 999);
4157 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincd, 16, kDRegSize, big64);
4158}
4159
4160typedef void (MacroAssembler::*QIncDecXWFn)(const Register& dst,
4161 const Register& src,
4162 int pattern,
4163 int multiplier);
4164
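// The <Xdn>, <Wdn> forms exercised here operate on the 32-bit accumulator and
// write the saturated result, sign-extended, to the 64-bit destination; hence
// acc_value is an int32_t.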
4165static void QIncDecXWHelper(Test* config,
4166 QIncDecXWFn cnt,
4167 int multiplier,
4168 int lane_size_in_bits,
4169 int32_t acc_value,
4170 bool is_increment) {
4171 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
4172 START();
4173
4174 // Initialise accumulators.
4175 __ Mov(x0, acc_value);
4176 __ Mov(x1, acc_value);
4177 __ Mov(x2, acc_value);
4178 __ Mov(x3, acc_value);
4179 __ Mov(x4, acc_value);
4180 __ Mov(x5, acc_value);
4181 __ Mov(x6, acc_value);
4182 __ Mov(x7, acc_value);
4183 __ Mov(x8, acc_value);
4184 __ Mov(x9, acc_value);
4185 __ Mov(x10, acc_value);
4186 __ Mov(x11, acc_value);
4187 __ Mov(x12, acc_value);
4188 __ Mov(x13, acc_value);
4189 __ Mov(x14, acc_value);
4190 __ Mov(x15, acc_value);
4191 __ Mov(x18, acc_value);
4192 __ Mov(x19, acc_value);
4193 __ Mov(x20, acc_value);
4194 __ Mov(x21, acc_value);
4195
4196 (masm.*cnt)(x0, w0, SVE_POW2, multiplier);
4197 (masm.*cnt)(x1, w1, SVE_VL1, multiplier);
4198 (masm.*cnt)(x2, w2, SVE_VL2, multiplier);
4199 (masm.*cnt)(x3, w3, SVE_VL3, multiplier);
4200 (masm.*cnt)(x4, w4, SVE_VL4, multiplier);
4201 (masm.*cnt)(x5, w5, SVE_VL5, multiplier);
4202 (masm.*cnt)(x6, w6, SVE_VL6, multiplier);
4203 (masm.*cnt)(x7, w7, SVE_VL7, multiplier);
4204 (masm.*cnt)(x8, w8, SVE_VL8, multiplier);
4205 (masm.*cnt)(x9, w9, SVE_VL16, multiplier);
4206 (masm.*cnt)(x10, w10, SVE_VL32, multiplier);
4207 (masm.*cnt)(x11, w11, SVE_VL64, multiplier);
4208 (masm.*cnt)(x12, w12, SVE_VL128, multiplier);
4209 (masm.*cnt)(x13, w13, SVE_VL256, multiplier);
4210 (masm.*cnt)(x14, w14, 16, multiplier);
4211 (masm.*cnt)(x15, w15, 23, multiplier);
4212 (masm.*cnt)(x18, w18, 28, multiplier);
4213 (masm.*cnt)(x19, w19, SVE_MUL4, multiplier);
4214 (masm.*cnt)(x20, w20, SVE_MUL3, multiplier);
4215 (masm.*cnt)(x21, w21, SVE_ALL, multiplier);
4216
4217 END();
4218
4219 if (CAN_RUN()) {
4220 RUN();
4221
4222 int all = core.GetSVELaneCount(lane_size_in_bits);
4223 int pow2 = 1 << HighestSetBitPosition(all);
4224 int mul4 = all - (all % 4);
4225 int mul3 = all - (all % 3);
4226
4227 multiplier = is_increment ? multiplier : -multiplier;
4228
4229 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * pow2), x0);
4230 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(1, all)), x1);
4231 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(2, all)), x2);
4232 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(3, all)), x3);
4233 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(4, all)), x4);
4234 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(5, all)), x5);
4235 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(6, all)), x6);
4236 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(7, all)), x7);
4237 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(8, all)), x8);
4238 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(16, all)), x9);
4239 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(32, all)), x10);
4240 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(64, all)), x11);
4241 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(128, all)), x12);
4242 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(256, all)), x13);
4243 ASSERT_EQUAL_64(acc_value, x14);
4244 ASSERT_EQUAL_64(acc_value, x15);
4245 ASSERT_EQUAL_64(acc_value, x18);
4246 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * mul4), x19);
4247 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * mul3), x20);
4248 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * all), x21);
4249 }
4250}
4251
4252static void QIncXWHelper(Test* config,
4253 QIncDecXWFn cnt,
4254 int multiplier,
4255 int lane_size_in_bits,
4256 int32_t acc_value) {
4257 QIncDecXWHelper(config, cnt, multiplier, lane_size_in_bits, acc_value, true);
4258}
4259
4260static void QDecXWHelper(Test* config,
4261 QIncDecXWFn cnt,
4262 int multiplier,
4263 int lane_size_in_bits,
4264 int32_t acc_value) {
4265 QIncDecXWHelper(config, cnt, multiplier, lane_size_in_bits, acc_value, false);
4266}
4267
4268TEST_SVE(sve_sqdecb_xw) {
4269 QDecXWHelper(config, &MacroAssembler::Sqdecb, 1, kBRegSize, 1);
4270 QDecXWHelper(config, &MacroAssembler::Sqdecb, 2, kBRegSize, INT32_MIN + 42);
4271 QDecXWHelper(config, &MacroAssembler::Sqdecb, 15, kBRegSize, 999);
4272 QDecXWHelper(config, &MacroAssembler::Sqdecb, 16, kBRegSize, INT32_MAX - 42);
4273}
4274
4275TEST_SVE(sve_sqdech_xw) {
4276 QDecXWHelper(config, &MacroAssembler::Sqdech, 1, kHRegSize, 1);
4277 QDecXWHelper(config, &MacroAssembler::Sqdech, 2, kHRegSize, INT32_MIN + 42);
4278 QDecXWHelper(config, &MacroAssembler::Sqdech, 15, kHRegSize, 999);
4279 QDecXWHelper(config, &MacroAssembler::Sqdech, 16, kHRegSize, INT32_MAX - 42);
4280}
4281
4282TEST_SVE(sve_sqdecw_xw) {
4283 QDecXWHelper(config, &MacroAssembler::Sqdecw, 1, kWRegSize, 1);
4284 QDecXWHelper(config, &MacroAssembler::Sqdecw, 2, kWRegSize, INT32_MIN + 42);
4285 QDecXWHelper(config, &MacroAssembler::Sqdecw, 15, kWRegSize, 999);
4286 QDecXWHelper(config, &MacroAssembler::Sqdecw, 16, kWRegSize, INT32_MAX - 42);
4287}
4288
4289TEST_SVE(sve_sqdecd_xw) {
4290 QDecXWHelper(config, &MacroAssembler::Sqdecd, 1, kDRegSize, 1);
4291 QDecXWHelper(config, &MacroAssembler::Sqdecd, 2, kDRegSize, INT32_MIN + 42);
4292 QDecXWHelper(config, &MacroAssembler::Sqdecd, 15, kDRegSize, 999);
4293 QDecXWHelper(config, &MacroAssembler::Sqdecd, 16, kDRegSize, INT32_MAX - 42);
4294}
4295
4296TEST_SVE(sve_sqincb_xw) {
4297 QIncXWHelper(config, &MacroAssembler::Sqincb, 1, kBRegSize, 1);
4298 QIncXWHelper(config, &MacroAssembler::Sqincb, 2, kBRegSize, INT32_MIN + 42);
4299 QIncXWHelper(config, &MacroAssembler::Sqincb, 15, kBRegSize, 999);
4300 QIncXWHelper(config, &MacroAssembler::Sqincb, 16, kBRegSize, INT32_MAX - 42);
4301}
4302
4303TEST_SVE(sve_sqinch_xw) {
4304 QIncXWHelper(config, &MacroAssembler::Sqinch, 1, kHRegSize, 1);
4305 QIncXWHelper(config, &MacroAssembler::Sqinch, 2, kHRegSize, INT32_MIN + 42);
4306 QIncXWHelper(config, &MacroAssembler::Sqinch, 15, kHRegSize, 999);
4307 QIncXWHelper(config, &MacroAssembler::Sqinch, 16, kHRegSize, INT32_MAX - 42);
4308}
4309
4310TEST_SVE(sve_sqincw_xw) {
4311 QIncXWHelper(config, &MacroAssembler::Sqincw, 1, kWRegSize, 1);
4312 QIncXWHelper(config, &MacroAssembler::Sqincw, 2, kWRegSize, INT32_MIN + 42);
4313 QIncXWHelper(config, &MacroAssembler::Sqincw, 15, kWRegSize, 999);
4314 QIncXWHelper(config, &MacroAssembler::Sqincw, 16, kWRegSize, INT32_MAX - 42);
4315}
4316
4317TEST_SVE(sve_sqincd_xw) {
4318 QIncXWHelper(config, &MacroAssembler::Sqincd, 1, kDRegSize, 1);
4319 QIncXWHelper(config, &MacroAssembler::Sqincd, 2, kDRegSize, INT32_MIN + 42);
4320 QIncXWHelper(config, &MacroAssembler::Sqincd, 15, kDRegSize, 999);
4321 QIncXWHelper(config, &MacroAssembler::Sqincd, 16, kDRegSize, INT32_MAX - 42);
4322}
4323
Martyn Capewell8188ddf2019-11-21 17:09:34 +00004324typedef void (MacroAssembler::*IncDecZFn)(const ZRegister& dst,
4325 int pattern,
4326 int multiplier);
4327typedef void (MacroAssembler::*AddSubFn)(const ZRegister& dst,
4328 const ZRegister& src1,
4329 const ZRegister& src2);
4330
4331static void IncDecZHelper(Test* config,
4332 IncDecZFn fn,
4333 CntFn cnt,
4334 AddSubFn addsub,
4335 int multiplier,
4336 int lane_size_in_bits) {
4337 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
4338 START();
4339
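  // Strategy: apply the inc/dec-by-pattern form to z16-z30, compute the same
  // adjustment independently with a scalar cnt instruction broadcast via Dup
  // and a plain Add/Sub on z0-z14, then check that the two sets of registers
  // match.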
4340 uint64_t acc_inputs[] = {0x7766554433221100,
4341 0xffffffffffffffff,
4342 0x0000000000000000,
4343 0xffffffff0000ffff,
4344 0x7fffffffffffffff,
4345 0x8000000000000000,
4346 0x7fffffff7fff7fff,
4347 0x8000000080008000};
4348
4349 for (unsigned i = 0; i < kNumberOfZRegisters; i++) {
4350 for (int j = 0; j < 4; j++) {
4351 InsrHelper(&masm, ZRegister(i, kDRegSize), acc_inputs);
4352 }
4353 }
4354 for (unsigned i = 0; i < 15; i++) {
4355 __ Mov(XRegister(i), 0);
4356 }
4357
4358 (masm.*fn)(z16.WithLaneSize(lane_size_in_bits), SVE_POW2, multiplier);
4359 (masm.*fn)(z17.WithLaneSize(lane_size_in_bits), SVE_VL1, multiplier);
4360 (masm.*fn)(z18.WithLaneSize(lane_size_in_bits), SVE_VL2, multiplier);
4361 (masm.*fn)(z19.WithLaneSize(lane_size_in_bits), SVE_VL3, multiplier);
4362 (masm.*fn)(z20.WithLaneSize(lane_size_in_bits), SVE_VL4, multiplier);
4363 (masm.*fn)(z21.WithLaneSize(lane_size_in_bits), SVE_VL7, multiplier);
4364 (masm.*fn)(z22.WithLaneSize(lane_size_in_bits), SVE_VL8, multiplier);
4365 (masm.*fn)(z23.WithLaneSize(lane_size_in_bits), SVE_VL16, multiplier);
4366 (masm.*fn)(z24.WithLaneSize(lane_size_in_bits), SVE_VL64, multiplier);
4367 (masm.*fn)(z25.WithLaneSize(lane_size_in_bits), SVE_VL256, multiplier);
4368 (masm.*fn)(z26.WithLaneSize(lane_size_in_bits), 16, multiplier);
4369 (masm.*fn)(z27.WithLaneSize(lane_size_in_bits), 28, multiplier);
4370 (masm.*fn)(z28.WithLaneSize(lane_size_in_bits), SVE_MUL3, multiplier);
4371 (masm.*fn)(z29.WithLaneSize(lane_size_in_bits), SVE_MUL4, multiplier);
4372 (masm.*fn)(z30.WithLaneSize(lane_size_in_bits), SVE_ALL, multiplier);
4373
4374  // Compute the expected values via an alternative route: `cnt` into X
        // registers here, then `Dup` and an add/sub below.
4375 (masm.*cnt)(x0, SVE_POW2, multiplier);
4376 (masm.*cnt)(x1, SVE_VL1, multiplier);
4377 (masm.*cnt)(x2, SVE_VL2, multiplier);
4378 (masm.*cnt)(x3, SVE_VL3, multiplier);
4379 (masm.*cnt)(x4, SVE_VL4, multiplier);
4380 (masm.*cnt)(x5, SVE_VL7, multiplier);
4381 (masm.*cnt)(x6, SVE_VL8, multiplier);
4382 (masm.*cnt)(x7, SVE_VL16, multiplier);
4383 (masm.*cnt)(x8, SVE_VL64, multiplier);
4384 (masm.*cnt)(x9, SVE_VL256, multiplier);
4385 (masm.*cnt)(x10, 16, multiplier);
4386 (masm.*cnt)(x11, 28, multiplier);
4387 (masm.*cnt)(x12, SVE_MUL3, multiplier);
4388 (masm.*cnt)(x13, SVE_MUL4, multiplier);
4389 (masm.*cnt)(x14, SVE_ALL, multiplier);
4390
4391 ZRegister zscratch = z15.WithLaneSize(lane_size_in_bits);
4392 for (unsigned i = 0; i < 15; i++) {
4393 ZRegister zsrcdst = ZRegister(i, lane_size_in_bits);
4394 Register x = Register(i, kXRegSize);
4395 __ Dup(zscratch, x);
4396 (masm.*addsub)(zsrcdst, zsrcdst, zscratch);
4397 }
4398
4399 END();
4400
4401 if (CAN_RUN()) {
4402 RUN();
4403
4404 ASSERT_EQUAL_SVE(z0, z16);
4405 ASSERT_EQUAL_SVE(z1, z17);
4406 ASSERT_EQUAL_SVE(z2, z18);
4407 ASSERT_EQUAL_SVE(z3, z19);
4408 ASSERT_EQUAL_SVE(z4, z20);
4409 ASSERT_EQUAL_SVE(z5, z21);
4410 ASSERT_EQUAL_SVE(z6, z22);
4411 ASSERT_EQUAL_SVE(z7, z23);
4412 ASSERT_EQUAL_SVE(z8, z24);
4413 ASSERT_EQUAL_SVE(z9, z25);
4414 ASSERT_EQUAL_SVE(z10, z26);
4415 ASSERT_EQUAL_SVE(z11, z27);
4416 ASSERT_EQUAL_SVE(z12, z28);
4417 ASSERT_EQUAL_SVE(z13, z29);
4418 ASSERT_EQUAL_SVE(z14, z30);
4419 }
4420}
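
// The helper above cross-checks each vector `inc`/`dec` form against an
// equivalent scalar sequence built from `cnt`, `dup` and an add or subtract.
// As a hedged sketch (assuming a 256-bit VL, i.e. 16 H lanes), the pair of
// computations it compares for `Inch(z16.VnH(), SVE_VL8, 2)` would be roughly:
//
//   inch z16.h, vl8, mul #2    // z16.h += 8 * 2 in every lane
//
//   cnth x0, vl8, mul #2       // x0 = 16
//   dup  z15.h, w0             // broadcast the count (the Dup above)
//   add  z0.h, z0.h, z15.h     // same result as the inch form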
4421
4422TEST_SVE(sve_inc_dec_vec) {
4423 CntFn cnth = &MacroAssembler::Cnth;
4424 CntFn cntw = &MacroAssembler::Cntw;
4425 CntFn cntd = &MacroAssembler::Cntd;
4426 AddSubFn sub = &MacroAssembler::Sub;
4427 AddSubFn add = &MacroAssembler::Add;
4428 for (int mult = 1; mult <= 16; mult += 5) {
4429 IncDecZHelper(config, &MacroAssembler::Dech, cnth, sub, mult, kHRegSize);
4430 IncDecZHelper(config, &MacroAssembler::Decw, cntw, sub, mult, kSRegSize);
4431 IncDecZHelper(config, &MacroAssembler::Decd, cntd, sub, mult, kDRegSize);
4432 IncDecZHelper(config, &MacroAssembler::Inch, cnth, add, mult, kHRegSize);
4433 IncDecZHelper(config, &MacroAssembler::Incw, cntw, add, mult, kSRegSize);
4434 IncDecZHelper(config, &MacroAssembler::Incd, cntd, add, mult, kDRegSize);
4435 }
4436}
4437
4438TEST_SVE(sve_unsigned_sat_inc_dec_vec) {
4439 CntFn cnth = &MacroAssembler::Cnth;
4440 CntFn cntw = &MacroAssembler::Cntw;
4441 CntFn cntd = &MacroAssembler::Cntd;
4442 AddSubFn sub = &MacroAssembler::Uqsub;
4443 AddSubFn add = &MacroAssembler::Uqadd;
4444 for (int mult = 1; mult <= 16; mult += 5) {
4445 IncDecZHelper(config, &MacroAssembler::Uqdech, cnth, sub, mult, kHRegSize);
4446 IncDecZHelper(config, &MacroAssembler::Uqdecw, cntw, sub, mult, kSRegSize);
4447 IncDecZHelper(config, &MacroAssembler::Uqdecd, cntd, sub, mult, kDRegSize);
4448 IncDecZHelper(config, &MacroAssembler::Uqinch, cnth, add, mult, kHRegSize);
4449 IncDecZHelper(config, &MacroAssembler::Uqincw, cntw, add, mult, kSRegSize);
4450 IncDecZHelper(config, &MacroAssembler::Uqincd, cntd, add, mult, kDRegSize);
4451 }
4452}
4453
4454TEST_SVE(sve_signed_sat_inc_dec_vec) {
4455 CntFn cnth = &MacroAssembler::Cnth;
4456 CntFn cntw = &MacroAssembler::Cntw;
4457 CntFn cntd = &MacroAssembler::Cntd;
4458 AddSubFn sub = &MacroAssembler::Sqsub;
4459 AddSubFn add = &MacroAssembler::Sqadd;
4460 for (int mult = 1; mult <= 16; mult += 5) {
4461 IncDecZHelper(config, &MacroAssembler::Sqdech, cnth, sub, mult, kHRegSize);
4462 IncDecZHelper(config, &MacroAssembler::Sqdecw, cntw, sub, mult, kSRegSize);
4463 IncDecZHelper(config, &MacroAssembler::Sqdecd, cntd, sub, mult, kDRegSize);
4464 IncDecZHelper(config, &MacroAssembler::Sqinch, cnth, add, mult, kHRegSize);
4465 IncDecZHelper(config, &MacroAssembler::Sqincw, cntw, add, mult, kSRegSize);
4466 IncDecZHelper(config, &MacroAssembler::Sqincd, cntd, add, mult, kDRegSize);
4467 }
4468}
4469
TatWai Chong7a0d3672019-10-23 17:35:18 -07004470typedef void (MacroAssembler::*ArithPredicatedFn)(const ZRegister& zd,
4471 const PRegisterM& pg,
4472 const ZRegister& zn,
4473 const ZRegister& zm);
TatWai Chong13634762019-07-16 16:20:45 -07004474
4475template <typename Td, typename Tg, typename Tn>
4476static void IntBinArithHelper(Test* config,
TatWai Chong7a0d3672019-10-23 17:35:18 -07004477 ArithPredicatedFn macro,
TatWai Chong13634762019-07-16 16:20:45 -07004478 unsigned lane_size_in_bits,
4479 const Tg& pg_inputs,
4480 const Tn& zn_inputs,
4481 const Tn& zm_inputs,
4482 const Td& zd_expected) {
4483 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
4484 START();
4485
4486 ZRegister src_a = z31.WithLaneSize(lane_size_in_bits);
4487 ZRegister src_b = z27.WithLaneSize(lane_size_in_bits);
4488 InsrHelper(&masm, src_a, zn_inputs);
4489 InsrHelper(&masm, src_b, zm_inputs);
4490
4491 Initialise(&masm, p0.WithLaneSize(lane_size_in_bits), pg_inputs);
4492
4493 ZRegister zd_1 = z0.WithLaneSize(lane_size_in_bits);
4494 ZRegister zd_2 = z1.WithLaneSize(lane_size_in_bits);
4495 ZRegister zd_3 = z2.WithLaneSize(lane_size_in_bits);
4496
4497 // `instr` zd(dst), zd(src_a), zn(src_b)
4498 __ Mov(zd_1, src_a);
4499 (masm.*macro)(zd_1, p0.Merging(), zd_1, src_b);
4500
4501 // `instr` zd(dst), zm(src_a), zd(src_b)
4502  // If the zd and zm registers are aliased, the macro (`Instr`) swaps the
4503  // operand order when the operation is commutative; otherwise it falls back
4504  // to the reversed form of `Instr`, such as subr or divr.
4505 __ Mov(zd_2, src_b);
4506 (masm.*macro)(zd_2, p0.Merging(), src_a, zd_2);
4507
4508 // `instr` zd(dst), zm(src_a), zn(src_b)
4509 // The macro of instructions (`Instr`) automatically selects between `instr`
4510 // and movprfx + `instr` based on whether zd and zn registers are aliased.
4511  // Any generated movprfx instruction is predicated, using the same
TatWai Chong13634762019-07-16 16:20:45 -07004512 // governing predicate register. In order to keep the result constant,
4513 // initialize the destination register first.
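  // For illustration only (one plausible expansion, shown with H lanes): the
  // zd_3 case below, Sub(z2, p0.Merging(), z31, z27), may become
  //   movprfx z2.h, p0/m, z31.h
  //   sub     z2.h, p0/m, z2.h, z27.h
  // while the zd_2 case above, Sub(z1, p0.Merging(), z31, z1), maps onto
  //   subr    z1.h, p0/m, z1.h, z31.h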
4514 __ Mov(zd_3, src_a);
4515 (masm.*macro)(zd_3, p0.Merging(), src_a, src_b);
4516
4517 END();
4518
4519 if (CAN_RUN()) {
4520 RUN();
4521 ASSERT_EQUAL_SVE(zd_expected, zd_1);
4522
4523 for (size_t i = 0; i < ArrayLength(zd_expected); i++) {
4524 int lane = static_cast<int>(ArrayLength(zd_expected) - i - 1);
4525 if (!core.HasSVELane(zd_1, lane)) break;
TatWai Chongd316c5e2019-10-16 12:22:10 -07004526 if ((pg_inputs[i] & 1) != 0) {
TatWai Chong13634762019-07-16 16:20:45 -07004527 ASSERT_EQUAL_SVE_LANE(zd_expected[i], zd_1, lane);
4528 } else {
4529 ASSERT_EQUAL_SVE_LANE(zn_inputs[i], zd_1, lane);
4530 }
4531 }
4532
4533 ASSERT_EQUAL_SVE(zd_expected, zd_3);
4534 }
4535}
4536
4537TEST_SVE(sve_binary_arithmetic_predicated_add) {
4538 // clang-format off
4539 unsigned zn_b[] = {0x00, 0x01, 0x10, 0x81, 0xff, 0x0f, 0x01, 0x7f};
4540
4541 unsigned zm_b[] = {0x00, 0x01, 0x10, 0x00, 0x81, 0x80, 0xff, 0xff};
4542
4543 unsigned zn_h[] = {0x0000, 0x0123, 0x1010, 0x8181, 0xffff, 0x0f0f, 0x0101, 0x7f7f};
4544
4545 unsigned zm_h[] = {0x0000, 0x0123, 0x1010, 0x0000, 0x8181, 0x8080, 0xffff, 0xffff};
4546
4547 unsigned zn_s[] = {0x00000000, 0x01234567, 0x10101010, 0x81818181,
4548 0xffffffff, 0x0f0f0f0f, 0x01010101, 0x7f7f7f7f};
4549
4550 unsigned zm_s[] = {0x00000000, 0x01234567, 0x10101010, 0x00000000,
4551 0x81818181, 0x80808080, 0xffffffff, 0xffffffff};
4552
4553 uint64_t zn_d[] = {0x0000000000000000, 0x0123456789abcdef,
4554 0x1010101010101010, 0x8181818181818181,
4555 0xffffffffffffffff, 0x0f0f0f0f0f0f0f0f,
4556 0x0101010101010101, 0x7f7f7f7fffffffff};
4557
4558 uint64_t zm_d[] = {0x0000000000000000, 0x0123456789abcdef,
4559 0x1010101010101010, 0x0000000000000000,
4560 0x8181818181818181, 0x8080808080808080,
4561 0xffffffffffffffff, 0xffffffffffffffff};
4562
4563 int pg_b[] = {1, 1, 1, 0, 1, 1, 1, 0};
4564 int pg_h[] = {1, 1, 0, 1, 1, 1, 0, 1};
4565 int pg_s[] = {1, 0, 1, 1, 1, 0, 1, 1};
4566 int pg_d[] = {0, 1, 1, 1, 0, 1, 1, 1};
4567
4568 unsigned add_exp_b[] = {0x00, 0x02, 0x20, 0x81, 0x80, 0x8f, 0x00, 0x7f};
4569
4570 unsigned add_exp_h[] = {0x0000, 0x0246, 0x1010, 0x8181,
4571 0x8180, 0x8f8f, 0x0101, 0x7f7e};
4572
4573 unsigned add_exp_s[] = {0x00000000, 0x01234567, 0x20202020, 0x81818181,
4574 0x81818180, 0x0f0f0f0f, 0x01010100, 0x7f7f7f7e};
4575
4576 uint64_t add_exp_d[] = {0x0000000000000000, 0x02468acf13579bde,
4577 0x2020202020202020, 0x8181818181818181,
4578 0xffffffffffffffff, 0x8f8f8f8f8f8f8f8f,
4579 0x0101010101010100, 0x7f7f7f7ffffffffe};
4580
TatWai Chong7a0d3672019-10-23 17:35:18 -07004581 ArithPredicatedFn fn = &MacroAssembler::Add;
TatWai Chong13634762019-07-16 16:20:45 -07004582 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, add_exp_b);
4583 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, add_exp_h);
4584 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, add_exp_s);
4585 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, add_exp_d);
4586
4587 unsigned sub_exp_b[] = {0x00, 0x00, 0x00, 0x81, 0x7e, 0x8f, 0x02, 0x7f};
4588
4589 unsigned sub_exp_h[] = {0x0000, 0x0000, 0x1010, 0x8181,
4590 0x7e7e, 0x8e8f, 0x0101, 0x7f80};
4591
4592 unsigned sub_exp_s[] = {0x00000000, 0x01234567, 0x00000000, 0x81818181,
4593 0x7e7e7e7e, 0x0f0f0f0f, 0x01010102, 0x7f7f7f80};
4594
4595 uint64_t sub_exp_d[] = {0x0000000000000000, 0x0000000000000000,
4596 0x0000000000000000, 0x8181818181818181,
4597 0xffffffffffffffff, 0x8e8e8e8e8e8e8e8f,
4598 0x0101010101010102, 0x7f7f7f8000000000};
4599
4600 fn = &MacroAssembler::Sub;
4601 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, sub_exp_b);
4602 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, sub_exp_h);
4603 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, sub_exp_s);
4604 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, sub_exp_d);
4605 // clang-format on
4606}
4607
4608TEST_SVE(sve_binary_arithmetic_predicated_umin_umax_uabd) {
4609 // clang-format off
4610 unsigned zn_b[] = {0x00, 0xff, 0x0f, 0xff, 0xf0, 0x98, 0x55, 0x67};
4611
4612 unsigned zm_b[] = {0x01, 0x00, 0x0e, 0xfe, 0xfe, 0xab, 0xcd, 0x78};
4613
4614 unsigned zn_h[] = {0x0000, 0xffff, 0x00ff, 0xffff,
4615 0xff00, 0xba98, 0x5555, 0x4567};
4616
4617 unsigned zm_h[] = {0x0001, 0x0000, 0x00ee, 0xfffe,
4618 0xfe00, 0xabab, 0xcdcd, 0x5678};
4619
4620 unsigned zn_s[] = {0x00000000, 0xffffffff, 0x0000ffff, 0xffffffff,
4621 0xffff0000, 0xfedcba98, 0x55555555, 0x01234567};
4622
4623 unsigned zm_s[] = {0x00000001, 0x00000000, 0x0000eeee, 0xfffffffe,
4624 0xfffe0000, 0xabababab, 0xcdcdcdcd, 0x12345678};
4625
4626 uint64_t zn_d[] = {0x0000000000000000, 0xffffffffffffffff,
4627 0x5555555555555555, 0x0000000001234567};
4628
4629 uint64_t zm_d[] = {0x0000000000000001, 0x0000000000000000,
4630 0xcdcdcdcdcdcdcdcd, 0x0000000012345678};
4631
4632 int pg_b[] = {1, 1, 1, 0, 1, 1, 1, 0};
4633 int pg_h[] = {1, 1, 0, 1, 1, 1, 0, 1};
4634 int pg_s[] = {1, 0, 1, 1, 1, 0, 1, 1};
4635 int pg_d[] = {1, 0, 1, 1};
4636
4637 unsigned umax_exp_b[] = {0x01, 0xff, 0x0f, 0xff, 0xfe, 0xab, 0xcd, 0x67};
4638
4639 unsigned umax_exp_h[] = {0x0001, 0xffff, 0x00ff, 0xffff,
4640 0xff00, 0xba98, 0x5555, 0x5678};
4641
4642 unsigned umax_exp_s[] = {0x00000001, 0xffffffff, 0x0000ffff, 0xffffffff,
4643 0xffff0000, 0xfedcba98, 0xcdcdcdcd, 0x12345678};
4644
4645 uint64_t umax_exp_d[] = {0x0000000000000001, 0xffffffffffffffff,
4646 0xcdcdcdcdcdcdcdcd, 0x0000000012345678};
4647
TatWai Chong7a0d3672019-10-23 17:35:18 -07004648 ArithPredicatedFn fn = &MacroAssembler::Umax;
TatWai Chong13634762019-07-16 16:20:45 -07004649 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, umax_exp_b);
4650 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, umax_exp_h);
4651 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, umax_exp_s);
4652 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, umax_exp_d);
4653
4654 unsigned umin_exp_b[] = {0x00, 0x00, 0x0e, 0xff, 0xf0, 0x98, 0x55, 0x67};
4655
4656 unsigned umin_exp_h[] = {0x0000, 0x0000, 0x00ff, 0xfffe,
4657 0xfe00, 0xabab, 0x5555, 0x4567};
4658
4659 unsigned umin_exp_s[] = {0x00000000, 0xffffffff, 0x0000eeee, 0xfffffffe,
4660 0xfffe0000, 0xfedcba98, 0x55555555, 0x01234567};
4661
4662 uint64_t umin_exp_d[] = {0x0000000000000000, 0xffffffffffffffff,
4663 0x5555555555555555, 0x0000000001234567};
4664 fn = &MacroAssembler::Umin;
4665 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, umin_exp_b);
4666 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, umin_exp_h);
4667 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, umin_exp_s);
4668 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, umin_exp_d);
4669
4670 unsigned uabd_exp_b[] = {0x01, 0xff, 0x01, 0xff, 0x0e, 0x13, 0x78, 0x67};
4671
4672 unsigned uabd_exp_h[] = {0x0001, 0xffff, 0x00ff, 0x0001,
4673 0x0100, 0x0eed, 0x5555, 0x1111};
4674
4675 unsigned uabd_exp_s[] = {0x00000001, 0xffffffff, 0x00001111, 0x00000001,
4676 0x00010000, 0xfedcba98, 0x78787878, 0x11111111};
4677
4678 uint64_t uabd_exp_d[] = {0x0000000000000001, 0xffffffffffffffff,
4679 0x7878787878787878, 0x0000000011111111};
4680
4681 fn = &MacroAssembler::Uabd;
4682 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, uabd_exp_b);
4683 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, uabd_exp_h);
4684 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, uabd_exp_s);
4685 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, uabd_exp_d);
4686 // clang-format on
4687}
4688
4689TEST_SVE(sve_binary_arithmetic_predicated_smin_smax_sabd) {
4690 // clang-format off
4691 int zn_b[] = {0, -128, -128, -128, -128, 127, 127, 1};
4692
4693 int zm_b[] = {-1, 0, -1, -127, 127, 126, -1, 0};
4694
4695 int zn_h[] = {0, INT16_MIN, INT16_MIN, INT16_MIN,
4696 INT16_MIN, INT16_MAX, INT16_MAX, 1};
4697
4698 int zm_h[] = {-1, 0, -1, INT16_MIN + 1,
4699 INT16_MAX, INT16_MAX - 1, -1, 0};
4700
4701 int zn_s[] = {0, INT32_MIN, INT32_MIN, INT32_MIN,
4702 INT32_MIN, INT32_MAX, INT32_MAX, 1};
4703
4704 int zm_s[] = {-1, 0, -1, -INT32_MAX,
4705 INT32_MAX, INT32_MAX - 1, -1, 0};
4706
4707 int64_t zn_d[] = {0, INT64_MIN, INT64_MIN, INT64_MIN,
4708 INT64_MIN, INT64_MAX, INT64_MAX, 1};
4709
4710 int64_t zm_d[] = {-1, 0, -1, INT64_MIN + 1,
4711 INT64_MAX, INT64_MAX - 1, -1, 0};
4712
4713 int pg_b[] = {1, 1, 1, 0, 1, 1, 1, 0};
4714 int pg_h[] = {1, 1, 0, 1, 1, 1, 0, 1};
4715 int pg_s[] = {1, 0, 1, 1, 1, 0, 1, 1};
4716 int pg_d[] = {0, 1, 1, 1, 0, 1, 1, 1};
4717
4718 int smax_exp_b[] = {0, 0, -1, -128, 127, 127, 127, 1};
4719
4720 int smax_exp_h[] = {0, 0, INT16_MIN, INT16_MIN + 1,
4721 INT16_MAX, INT16_MAX, INT16_MAX, 1};
4722
4723 int smax_exp_s[] = {0, INT32_MIN, -1, INT32_MIN + 1,
4724 INT32_MAX, INT32_MAX, INT32_MAX, 1};
4725
4726 int64_t smax_exp_d[] = {0, 0, -1, INT64_MIN + 1,
4727 INT64_MIN, INT64_MAX, INT64_MAX, 1};
4728
TatWai Chong7a0d3672019-10-23 17:35:18 -07004729 ArithPredicatedFn fn = &MacroAssembler::Smax;
TatWai Chong13634762019-07-16 16:20:45 -07004730 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, smax_exp_b);
4731 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, smax_exp_h);
4732 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, smax_exp_s);
4733 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, smax_exp_d);
4734
4735 int smin_exp_b[] = {-1, -128, -128, -128, -128, 126, -1, 1};
4736
4737 int smin_exp_h[] = {-1, INT16_MIN, INT16_MIN, INT16_MIN,
4738 INT16_MIN, INT16_MAX - 1, INT16_MAX, 0};
4739
4740 int smin_exp_s[] = {-1, INT32_MIN, INT32_MIN, INT32_MIN,
4741 INT32_MIN, INT32_MAX, -1, 0};
4742
4743 int64_t smin_exp_d[] = {0, INT64_MIN, INT64_MIN, INT64_MIN,
4744 INT64_MIN, INT64_MAX - 1, -1, 0};
4745
4746 fn = &MacroAssembler::Smin;
4747 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, smin_exp_b);
4748 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, smin_exp_h);
4749 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, smin_exp_s);
4750 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, smin_exp_d);
4751
4752 unsigned sabd_exp_b[] = {1, 128, 127, 128, 255, 1, 128, 1};
4753
4754 unsigned sabd_exp_h[] = {1, 0x8000, 0x8000, 1, 0xffff, 1, 0x7fff, 1};
4755
4756 unsigned sabd_exp_s[] = {1, 0x80000000, 0x7fffffff, 1,
4757 0xffffffff, 0x7fffffff, 0x80000000, 1};
4758
4759 uint64_t sabd_exp_d[] = {0, 0x8000000000000000, 0x7fffffffffffffff, 1,
4760 0x8000000000000000, 1, 0x8000000000000000, 1};
4761
4762 fn = &MacroAssembler::Sabd;
4763 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, sabd_exp_b);
4764 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, sabd_exp_h);
4765 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, sabd_exp_s);
4766 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, sabd_exp_d);
4767 // clang-format on
4768}
4769
4770TEST_SVE(sve_binary_arithmetic_predicated_mul_umulh) {
4771 // clang-format off
4772 unsigned zn_b[] = {0x00, 0x01, 0x20, 0x08, 0x80, 0xff, 0x55, 0xaa};
4773
4774 unsigned zm_b[] = {0x7f, 0xcd, 0x80, 0xff, 0x55, 0xaa, 0x00, 0x08};
4775
4776 unsigned zn_h[] = {0x0000, 0x0001, 0x0020, 0x0800,
4777 0x8000, 0xff00, 0x5555, 0xaaaa};
4778
4779 unsigned zm_h[] = {0x007f, 0x00cd, 0x0800, 0xffff,
4780 0x5555, 0xaaaa, 0x0001, 0x1234};
4781
4782 unsigned zn_s[] = {0x00000000, 0x00000001, 0x00200020, 0x08000800,
4783 0x12345678, 0xffffffff, 0x55555555, 0xaaaaaaaa};
4784
4785 unsigned zm_s[] = {0x00000000, 0x00000001, 0x00200020, 0x08000800,
4786 0x12345678, 0x22223333, 0x55556666, 0x77778888};
4787
4788 uint64_t zn_d[] = {0x0000000000000000, 0x5555555555555555,
4789 0xffffffffffffffff, 0xaaaaaaaaaaaaaaaa};
4790
4791 uint64_t zm_d[] = {0x0000000000000000, 0x1111111133333333,
4792 0xddddddddeeeeeeee, 0xaaaaaaaaaaaaaaaa};
4793
4794 int pg_b[] = {0, 1, 1, 1, 0, 1, 1, 1};
4795 int pg_h[] = {1, 0, 1, 1, 1, 0, 1, 1};
4796 int pg_s[] = {1, 1, 0, 1, 1, 1, 0, 1};
4797 int pg_d[] = {1, 1, 0, 1};
4798
4799 unsigned mul_exp_b[] = {0x00, 0xcd, 0x00, 0xf8, 0x80, 0x56, 0x00, 0x50};
4800
4801 unsigned mul_exp_h[] = {0x0000, 0x0001, 0x0000, 0xf800,
4802 0x8000, 0xff00, 0x5555, 0x9e88};
4803
4804 unsigned mul_exp_s[] = {0x00000000, 0x00000001, 0x00200020, 0x00400000,
4805 0x1df4d840, 0xddddcccd, 0x55555555, 0xb05afa50};
4806
4807 uint64_t mul_exp_d[] = {0x0000000000000000, 0xa4fa4fa4eeeeeeef,
4808 0xffffffffffffffff, 0x38e38e38e38e38e4};
4809
TatWai Chong7a0d3672019-10-23 17:35:18 -07004810 ArithPredicatedFn fn = &MacroAssembler::Mul;
TatWai Chong13634762019-07-16 16:20:45 -07004811 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, mul_exp_b);
4812 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, mul_exp_h);
4813 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, mul_exp_s);
4814 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, mul_exp_d);
4815
4816 unsigned umulh_exp_b[] = {0x00, 0x00, 0x10, 0x07, 0x80, 0xa9, 0x00, 0x05};
4817
4818 unsigned umulh_exp_h[] = {0x0000, 0x0001, 0x0001, 0x07ff,
4819 0x2aaa, 0xff00, 0x0000, 0x0c22};
4820
4821 unsigned umulh_exp_s[] = {0x00000000, 0x00000000, 0x00200020, 0x00400080,
4822 0x014b66dc, 0x22223332, 0x55555555, 0x4fa505af};
4823
4824 uint64_t umulh_exp_d[] = {0x0000000000000000, 0x05b05b05bbbbbbbb,
4825 0xffffffffffffffff, 0x71c71c71c71c71c6};
4826
4827 fn = &MacroAssembler::Umulh;
4828 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, umulh_exp_b);
4829 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, umulh_exp_h);
4830 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, umulh_exp_s);
4831 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, umulh_exp_d);
4832 // clang-format on
4833}
4834
4835TEST_SVE(sve_binary_arithmetic_predicated_smulh) {
4836 // clang-format off
4837 int zn_b[] = {0, 1, -1, INT8_MIN, INT8_MAX, -1, 100, -3};
4838
4839 int zm_b[] = {0, INT8_MIN, INT8_MIN, INT8_MAX, INT8_MAX, -1, 2, 66};
4840
4841 int zn_h[] = {0, 1, -1, INT16_MIN, INT16_MAX, -1, 10000, -3};
4842
4843 int zm_h[] = {0, INT16_MIN, INT16_MIN, INT16_MAX, INT16_MAX, -1, 2, 6666};
4844
4845 int zn_s[] = {0, 1, -1, INT32_MIN, INT32_MAX, -1, 100000000, -3};
4846
4847 int zm_s[] = {0, INT32_MIN, INT32_MIN, INT32_MAX, INT32_MAX, -1, 2, 66666666};
4848
4849 int64_t zn_d[] = {0, -1, INT64_MIN, INT64_MAX};
4850
4851 int64_t zm_d[] = {INT64_MIN, INT64_MAX, INT64_MIN, INT64_MAX};
4852
4853 int pg_b[] = {0, 1, 1, 1, 0, 1, 1, 1};
4854 int pg_h[] = {1, 0, 1, 1, 1, 0, 1, 1};
4855 int pg_s[] = {1, 1, 0, 1, 1, 1, 0, 1};
4856 int pg_d[] = {1, 1, 0, 1};
4857
4858 int exp_b[] = {0, -1, 0, -64, INT8_MAX, 0, 0, -1};
4859
4860 int exp_h[] = {0, 1, 0, -16384, 16383, -1, 0, -1};
4861
4862 int exp_s[] = {0, -1, -1, -1073741824, 1073741823, 0, 100000000, -1};
4863
4864 int64_t exp_d[] = {0, -1, INT64_MIN, 4611686018427387903};
4865
TatWai Chong7a0d3672019-10-23 17:35:18 -07004866 ArithPredicatedFn fn = &MacroAssembler::Smulh;
TatWai Chong13634762019-07-16 16:20:45 -07004867 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, exp_b);
4868 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, exp_h);
4869 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, exp_s);
4870 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, exp_d);
4871 // clang-format on
4872}
4873
4874TEST_SVE(sve_binary_arithmetic_predicated_logical) {
4875 // clang-format off
4876 unsigned zn_b[] = {0x00, 0x01, 0x20, 0x08, 0x80, 0xff, 0x55, 0xaa};
4877 unsigned zm_b[] = {0x7f, 0xcd, 0x80, 0xff, 0x55, 0xaa, 0x00, 0x08};
4878
4879 unsigned zn_h[] = {0x0000, 0x0001, 0x2020, 0x0008,
4880 0x8000, 0xffff, 0x5555, 0xaaaa};
4881 unsigned zm_h[] = {0x7fff, 0xabcd, 0x8000, 0xffff,
4882 0x5555, 0xaaaa, 0x0000, 0x0800};
4883
4884 unsigned zn_s[] = {0x00000001, 0x20200008, 0x8000ffff, 0x5555aaaa};
4885 unsigned zm_s[] = {0x7fffabcd, 0x8000ffff, 0x5555aaaa, 0x00000800};
4886
4887 uint64_t zn_d[] = {0xfedcba9876543210, 0x0123456789abcdef,
4888 0x0001200880ff55aa, 0x0022446688aaccee};
4889 uint64_t zm_d[] = {0xffffeeeeddddcccc, 0xccccddddeeeeffff,
4890 0x7fcd80ff55aa0008, 0x1133557799bbddff};
4891
4892 int pg_b[] = {0, 1, 1, 1, 0, 1, 1, 1};
4893 int pg_h[] = {1, 0, 1, 1, 1, 0, 1, 1};
4894 int pg_s[] = {1, 1, 1, 0};
4895 int pg_d[] = {1, 1, 0, 1};
4896
4897 unsigned and_exp_b[] = {0x00, 0x01, 0x00, 0x08, 0x80, 0xaa, 0x00, 0x08};
4898
4899 unsigned and_exp_h[] = {0x0000, 0x0001, 0x0000, 0x0008,
4900 0x0000, 0xffff, 0x0000, 0x0800};
4901
4902 unsigned and_exp_s[] = {0x00000001, 0x00000008, 0x0000aaaa, 0x5555aaaa};
4903
4904 uint64_t and_exp_d[] = {0xfedcaa8854540000, 0x0000454588aacdef,
4905 0x0001200880ff55aa, 0x0022446688aaccee};
4906
TatWai Chong7a0d3672019-10-23 17:35:18 -07004907 ArithPredicatedFn fn = &MacroAssembler::And;
TatWai Chong13634762019-07-16 16:20:45 -07004908 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, and_exp_b);
4909 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, and_exp_h);
4910 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, and_exp_s);
4911 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, and_exp_d);
4912
4913 unsigned bic_exp_b[] = {0x00, 0x00, 0x20, 0x00, 0x80, 0x55, 0x55, 0xa2};
4914
4915 unsigned bic_exp_h[] = {0x0000, 0x0001, 0x2020, 0x0000,
4916 0x8000, 0xffff, 0x5555, 0xa2aa};
4917
4918 unsigned bic_exp_s[] = {0x00000000, 0x20200000, 0x80005555, 0x5555aaaa};
4919
4920 uint64_t bic_exp_d[] = {0x0000101022003210, 0x0123002201010000,
4921 0x0001200880ff55aa, 0x0000000000000000};
4922
4923 fn = &MacroAssembler::Bic;
4924 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, bic_exp_b);
4925 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, bic_exp_h);
4926 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, bic_exp_s);
4927 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, bic_exp_d);
4928
4929 unsigned eor_exp_b[] = {0x00, 0xcc, 0xa0, 0xf7, 0x80, 0x55, 0x55, 0xa2};
4930
4931 unsigned eor_exp_h[] = {0x7fff, 0x0001, 0xa020, 0xfff7,
4932 0xd555, 0xffff, 0x5555, 0xa2aa};
4933
4934 unsigned eor_exp_s[] = {0x7fffabcc, 0xa020fff7, 0xd5555555, 0x5555aaaa};
4935
4936 uint64_t eor_exp_d[] = {0x01235476ab89fedc, 0xcdef98ba67453210,
4937 0x0001200880ff55aa, 0x1111111111111111};
4938
4939 fn = &MacroAssembler::Eor;
4940 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, eor_exp_b);
4941 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, eor_exp_h);
4942 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, eor_exp_s);
4943 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, eor_exp_d);
4944
4945 unsigned orr_exp_b[] = {0x00, 0xcd, 0xa0, 0xff, 0x80, 0xff, 0x55, 0xaa};
4946
4947 unsigned orr_exp_h[] = {0x7fff, 0x0001, 0xa020, 0xffff,
4948 0xd555, 0xffff, 0x5555, 0xaaaa};
4949
4950 unsigned orr_exp_s[] = {0x7fffabcd, 0xa020ffff, 0xd555ffff, 0x5555aaaa};
4951
4952 uint64_t orr_exp_d[] = {0xfffffefeffddfedc, 0xcdefddffefefffff,
4953 0x0001200880ff55aa, 0x1133557799bbddff};
4954
4955 fn = &MacroAssembler::Orr;
4956 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, orr_exp_b);
4957 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, orr_exp_h);
4958 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, orr_exp_s);
4959 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, orr_exp_d);
4960 // clang-format on
4961}
4962
4963TEST_SVE(sve_binary_arithmetic_predicated_sdiv) {
4964 // clang-format off
4965 int zn_s[] = {0, 1, -1, 2468,
4966 INT32_MIN, INT32_MAX, INT32_MIN, INT32_MAX,
4967 -11111111, 87654321, 0, 0};
4968
4969 int zm_s[] = {1, -1, 1, 1234,
4970 -1, INT32_MIN, 1, -1,
4971 22222222, 80000000, -1, 0};
4972
4973 int64_t zn_d[] = {0, 1, -1, 2468,
4974 INT64_MIN, INT64_MAX, INT64_MIN, INT64_MAX,
4975 -11111111, 87654321, 0, 0};
4976
4977 int64_t zm_d[] = {1, -1, 1, 1234,
4978 -1, INT64_MIN, 1, -1,
4979 22222222, 80000000, -1, 0};
4980
4981 int pg_s[] = {1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0};
4982 int pg_d[] = {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1};
4983
4984 int exp_s[] = {0, 1, -1, 2,
4985 INT32_MIN, 0, INT32_MIN, -INT32_MAX,
4986 0, 1, 0, 0};
4987
4988 int64_t exp_d[] = {0, -1, -1, 2,
4989 INT64_MIN, INT64_MAX, INT64_MIN, -INT64_MAX,
4990 0, 1, 0, 0};
4991
TatWai Chong7a0d3672019-10-23 17:35:18 -07004992 ArithPredicatedFn fn = &MacroAssembler::Sdiv;
TatWai Chong13634762019-07-16 16:20:45 -07004993 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, exp_s);
4994 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, exp_d);
4995 // clang-format on
4996}
4997
4998TEST_SVE(sve_binary_arithmetic_predicated_udiv) {
4999 // clang-format off
5000 unsigned zn_s[] = {0x00000000, 0x00000001, 0xffffffff, 0x80000000,
5001 0xffffffff, 0x80000000, 0xffffffff, 0x0000f000};
5002
5003 unsigned zm_s[] = {0x00000001, 0xffffffff, 0x80000000, 0x00000002,
5004 0x00000000, 0x00000001, 0x00008000, 0xf0000000};
5005
5006 uint64_t zn_d[] = {0x0000000000000000, 0x0000000000000001,
5007 0xffffffffffffffff, 0x8000000000000000,
5008 0xffffffffffffffff, 0x8000000000000000,
5009 0xffffffffffffffff, 0xf0000000f0000000};
5010
5011 uint64_t zm_d[] = {0x0000000000000001, 0xffffffff00000000,
5012 0x8000000000000000, 0x0000000000000002,
5013 0x8888888888888888, 0x0000000000000001,
5014 0x0000000080000000, 0x00000000f0000000};
5015
5016 int pg_s[] = {1, 1, 0, 1, 1, 0, 1, 1};
5017 int pg_d[] = {1, 0, 1, 1, 1, 1, 0, 1};
5018
5019 unsigned exp_s[] = {0x00000000, 0x00000000, 0xffffffff, 0x40000000,
5020 0x00000000, 0x80000000, 0x0001ffff, 0x00000000};
5021
5022 uint64_t exp_d[] = {0x0000000000000000, 0x0000000000000001,
5023 0x0000000000000001, 0x4000000000000000,
5024 0x0000000000000001, 0x8000000000000000,
5025 0xffffffffffffffff, 0x0000000100000001};
5026
TatWai Chong7a0d3672019-10-23 17:35:18 -07005027 ArithPredicatedFn fn = &MacroAssembler::Udiv;
TatWai Chong13634762019-07-16 16:20:45 -07005028 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, exp_s);
5029 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, exp_d);
5030 // clang-format on
5031}
5032
TatWai Chong7a0d3672019-10-23 17:35:18 -07005033typedef void (MacroAssembler::*ArithFn)(const ZRegister& zd,
5034 const ZRegister& zn,
5035 const ZRegister& zm);
TatWai Chong845246b2019-08-08 00:01:58 -07005036
5037template <typename T>
5038static void IntArithHelper(Test* config,
TatWai Chong7a0d3672019-10-23 17:35:18 -07005039 ArithFn macro,
TatWai Chong845246b2019-08-08 00:01:58 -07005040 unsigned lane_size_in_bits,
5041 const T& zn_inputs,
5042 const T& zm_inputs,
5043 const T& zd_expected) {
5044 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5045 START();
5046
5047 ZRegister zn = z31.WithLaneSize(lane_size_in_bits);
5048 ZRegister zm = z27.WithLaneSize(lane_size_in_bits);
5049 InsrHelper(&masm, zn, zn_inputs);
5050 InsrHelper(&masm, zm, zm_inputs);
5051
5052 ZRegister zd = z0.WithLaneSize(lane_size_in_bits);
5053 (masm.*macro)(zd, zn, zm);
5054
5055 END();
5056
5057 if (CAN_RUN()) {
5058 RUN();
5059 ASSERT_EQUAL_SVE(zd_expected, zd);
5060 }
5061}
5062
5063TEST_SVE(sve_arithmetic_unpredicated_add_sqadd_uqadd) {
5064 // clang-format off
TatWai Chong6995bfd2019-09-26 10:48:05 +01005065 unsigned in_b[] = {0x81, 0x7f, 0x10, 0xaa, 0x55, 0xff, 0xf0};
5066 unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa, 0x5555, 0xffff, 0xf0f0};
5067 unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0x10001010, 0xaaaaaaaa, 0xf000f0f0};
5068 uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f,
TatWai Chong845246b2019-08-08 00:01:58 -07005069 0x1000000010001010, 0xf0000000f000f0f0};
5070
TatWai Chong7a0d3672019-10-23 17:35:18 -07005071 ArithFn fn = &MacroAssembler::Add;
TatWai Chong845246b2019-08-08 00:01:58 -07005072
5073 unsigned add_exp_b[] = {0x02, 0xfe, 0x20, 0x54, 0xaa, 0xfe, 0xe0};
5074 unsigned add_exp_h[] = {0x0302, 0xfefe, 0x2020, 0x5554, 0xaaaa, 0xfffe, 0xe1e0};
5075 unsigned add_exp_s[] = {0x00030302, 0xfffefefe, 0x20002020, 0x55555554, 0xe001e1e0};
5076 uint64_t add_exp_d[] = {0x0000000300030302, 0xfffffffefffefefe,
5077 0x2000000020002020, 0xe0000001e001e1e0};
5078
TatWai Chong6995bfd2019-09-26 10:48:05 +01005079 IntArithHelper(config, fn, kBRegSize, in_b, in_b, add_exp_b);
5080 IntArithHelper(config, fn, kHRegSize, in_h, in_h, add_exp_h);
5081 IntArithHelper(config, fn, kSRegSize, in_s, in_s, add_exp_s);
5082 IntArithHelper(config, fn, kDRegSize, in_d, in_d, add_exp_d);
TatWai Chong845246b2019-08-08 00:01:58 -07005083
5084 fn = &MacroAssembler::Sqadd;
5085
5086 unsigned sqadd_exp_b[] = {0x80, 0x7f, 0x20, 0x80, 0x7f, 0xfe, 0xe0};
5087 unsigned sqadd_exp_h[] = {0x8000, 0x7fff, 0x2020, 0x8000, 0x7fff, 0xfffe, 0xe1e0};
5088 unsigned sqadd_exp_s[] = {0x80000000, 0x7fffffff, 0x20002020, 0x80000000, 0xe001e1e0};
5089 uint64_t sqadd_exp_d[] = {0x8000000000000000, 0x7fffffffffffffff,
5090 0x2000000020002020, 0xe0000001e001e1e0};
5091
TatWai Chong6995bfd2019-09-26 10:48:05 +01005092 IntArithHelper(config, fn, kBRegSize, in_b, in_b, sqadd_exp_b);
5093 IntArithHelper(config, fn, kHRegSize, in_h, in_h, sqadd_exp_h);
5094 IntArithHelper(config, fn, kSRegSize, in_s, in_s, sqadd_exp_s);
5095 IntArithHelper(config, fn, kDRegSize, in_d, in_d, sqadd_exp_d);
TatWai Chong845246b2019-08-08 00:01:58 -07005096
5097 fn = &MacroAssembler::Uqadd;
5098
5099 unsigned uqadd_exp_b[] = {0xff, 0xfe, 0x20, 0xff, 0xaa, 0xff, 0xff};
5100 unsigned uqadd_exp_h[] = {0xffff, 0xfefe, 0x2020, 0xffff, 0xaaaa, 0xffff, 0xffff};
5101 unsigned uqadd_exp_s[] = {0xffffffff, 0xfffefefe, 0x20002020, 0xffffffff, 0xffffffff};
5102 uint64_t uqadd_exp_d[] = {0xffffffffffffffff, 0xfffffffefffefefe,
5103 0x2000000020002020, 0xffffffffffffffff};
5104
TatWai Chong6995bfd2019-09-26 10:48:05 +01005105 IntArithHelper(config, fn, kBRegSize, in_b, in_b, uqadd_exp_b);
5106 IntArithHelper(config, fn, kHRegSize, in_h, in_h, uqadd_exp_h);
5107 IntArithHelper(config, fn, kSRegSize, in_s, in_s, uqadd_exp_s);
5108 IntArithHelper(config, fn, kDRegSize, in_d, in_d, uqadd_exp_d);
TatWai Chong845246b2019-08-08 00:01:58 -07005109 // clang-format on
5110}
5111
5112TEST_SVE(sve_arithmetic_unpredicated_sub_sqsub_uqsub) {
5113 // clang-format off
5114
5115 unsigned ins1_b[] = {0x81, 0x7f, 0x7e, 0xaa};
5116 unsigned ins2_b[] = {0x10, 0xf0, 0xf0, 0x55};
5117
5118 unsigned ins1_h[] = {0x8181, 0x7f7f, 0x7e7e, 0xaaaa};
5119 unsigned ins2_h[] = {0x1010, 0xf0f0, 0xf0f0, 0x5555};
5120
5121 unsigned ins1_s[] = {0x80018181, 0x7fff7f7f, 0x7eee7e7e, 0xaaaaaaaa};
5122 unsigned ins2_s[] = {0x10001010, 0xf000f0f0, 0xf000f0f0, 0x55555555};
5123
5124 uint64_t ins1_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f,
5125 0x7eeeeeee7eee7e7e, 0xaaaaaaaaaaaaaaaa};
5126 uint64_t ins2_d[] = {0x1000000010001010, 0xf0000000f000f0f0,
5127 0xf0000000f000f0f0, 0x5555555555555555};
5128
TatWai Chong7a0d3672019-10-23 17:35:18 -07005129 ArithFn fn = &MacroAssembler::Sub;
TatWai Chong845246b2019-08-08 00:01:58 -07005130
5131 unsigned ins1_sub_ins2_exp_b[] = {0x71, 0x8f, 0x8e, 0x55};
5132 unsigned ins1_sub_ins2_exp_h[] = {0x7171, 0x8e8f, 0x8d8e, 0x5555};
5133 unsigned ins1_sub_ins2_exp_s[] = {0x70017171, 0x8ffe8e8f, 0x8eed8d8e, 0x55555555};
5134 uint64_t ins1_sub_ins2_exp_d[] = {0x7000000170017171, 0x8ffffffe8ffe8e8f,
5135 0x8eeeeeed8eed8d8e, 0x5555555555555555};
5136
5137 IntArithHelper(config, fn, kBRegSize, ins1_b, ins2_b, ins1_sub_ins2_exp_b);
5138 IntArithHelper(config, fn, kHRegSize, ins1_h, ins2_h, ins1_sub_ins2_exp_h);
5139 IntArithHelper(config, fn, kSRegSize, ins1_s, ins2_s, ins1_sub_ins2_exp_s);
5140 IntArithHelper(config, fn, kDRegSize, ins1_d, ins2_d, ins1_sub_ins2_exp_d);
5141
5142 unsigned ins2_sub_ins1_exp_b[] = {0x8f, 0x71, 0x72, 0xab};
5143 unsigned ins2_sub_ins1_exp_h[] = {0x8e8f, 0x7171, 0x7272, 0xaaab};
5144 unsigned ins2_sub_ins1_exp_s[] = {0x8ffe8e8f, 0x70017171, 0x71127272, 0xaaaaaaab};
5145 uint64_t ins2_sub_ins1_exp_d[] = {0x8ffffffe8ffe8e8f, 0x7000000170017171,
5146 0x7111111271127272, 0xaaaaaaaaaaaaaaab};
5147
5148 IntArithHelper(config, fn, kBRegSize, ins2_b, ins1_b, ins2_sub_ins1_exp_b);
5149 IntArithHelper(config, fn, kHRegSize, ins2_h, ins1_h, ins2_sub_ins1_exp_h);
5150 IntArithHelper(config, fn, kSRegSize, ins2_s, ins1_s, ins2_sub_ins1_exp_s);
5151 IntArithHelper(config, fn, kDRegSize, ins2_d, ins1_d, ins2_sub_ins1_exp_d);
5152
5153 fn = &MacroAssembler::Sqsub;
5154
5155 unsigned ins1_sqsub_ins2_exp_b[] = {0x80, 0x7f, 0x7f, 0x80};
5156 unsigned ins1_sqsub_ins2_exp_h[] = {0x8000, 0x7fff, 0x7fff, 0x8000};
5157 unsigned ins1_sqsub_ins2_exp_s[] = {0x80000000, 0x7fffffff, 0x7fffffff, 0x80000000};
5158 uint64_t ins1_sqsub_ins2_exp_d[] = {0x8000000000000000, 0x7fffffffffffffff,
5159 0x7fffffffffffffff, 0x8000000000000000};
5160
5161 IntArithHelper(config, fn, kBRegSize, ins1_b, ins2_b, ins1_sqsub_ins2_exp_b);
5162 IntArithHelper(config, fn, kHRegSize, ins1_h, ins2_h, ins1_sqsub_ins2_exp_h);
5163 IntArithHelper(config, fn, kSRegSize, ins1_s, ins2_s, ins1_sqsub_ins2_exp_s);
5164 IntArithHelper(config, fn, kDRegSize, ins1_d, ins2_d, ins1_sqsub_ins2_exp_d);
5165
5166 unsigned ins2_sqsub_ins1_exp_b[] = {0x7f, 0x80, 0x80, 0x7f};
5167 unsigned ins2_sqsub_ins1_exp_h[] = {0x7fff, 0x8000, 0x8000, 0x7fff};
5168 unsigned ins2_sqsub_ins1_exp_s[] = {0x7fffffff, 0x80000000, 0x80000000, 0x7fffffff};
5169 uint64_t ins2_sqsub_ins1_exp_d[] = {0x7fffffffffffffff, 0x8000000000000000,
5170 0x8000000000000000, 0x7fffffffffffffff};
5171
5172 IntArithHelper(config, fn, kBRegSize, ins2_b, ins1_b, ins2_sqsub_ins1_exp_b);
5173 IntArithHelper(config, fn, kHRegSize, ins2_h, ins1_h, ins2_sqsub_ins1_exp_h);
5174 IntArithHelper(config, fn, kSRegSize, ins2_s, ins1_s, ins2_sqsub_ins1_exp_s);
5175 IntArithHelper(config, fn, kDRegSize, ins2_d, ins1_d, ins2_sqsub_ins1_exp_d);
5176
5177 fn = &MacroAssembler::Uqsub;
5178
5179 unsigned ins1_uqsub_ins2_exp_b[] = {0x71, 0x00, 0x00, 0x55};
5180 unsigned ins1_uqsub_ins2_exp_h[] = {0x7171, 0x0000, 0x0000, 0x5555};
5181 unsigned ins1_uqsub_ins2_exp_s[] = {0x70017171, 0x00000000, 0x00000000, 0x55555555};
5182 uint64_t ins1_uqsub_ins2_exp_d[] = {0x7000000170017171, 0x0000000000000000,
5183 0x0000000000000000, 0x5555555555555555};
5184
5185 IntArithHelper(config, fn, kBRegSize, ins1_b, ins2_b, ins1_uqsub_ins2_exp_b);
5186 IntArithHelper(config, fn, kHRegSize, ins1_h, ins2_h, ins1_uqsub_ins2_exp_h);
5187 IntArithHelper(config, fn, kSRegSize, ins1_s, ins2_s, ins1_uqsub_ins2_exp_s);
5188 IntArithHelper(config, fn, kDRegSize, ins1_d, ins2_d, ins1_uqsub_ins2_exp_d);
5189
5190 unsigned ins2_uqsub_ins1_exp_b[] = {0x00, 0x71, 0x72, 0x00};
5191 unsigned ins2_uqsub_ins1_exp_h[] = {0x0000, 0x7171, 0x7272, 0x0000};
5192 unsigned ins2_uqsub_ins1_exp_s[] = {0x00000000, 0x70017171, 0x71127272, 0x00000000};
5193 uint64_t ins2_uqsub_ins1_exp_d[] = {0x0000000000000000, 0x7000000170017171,
5194 0x7111111271127272, 0x0000000000000000};
5195
5196 IntArithHelper(config, fn, kBRegSize, ins2_b, ins1_b, ins2_uqsub_ins1_exp_b);
5197 IntArithHelper(config, fn, kHRegSize, ins2_h, ins1_h, ins2_uqsub_ins1_exp_h);
5198 IntArithHelper(config, fn, kSRegSize, ins2_s, ins1_s, ins2_uqsub_ins1_exp_s);
5199 IntArithHelper(config, fn, kDRegSize, ins2_d, ins1_d, ins2_uqsub_ins1_exp_d);
5200 // clang-format on
5201}
5202
Jacob Bramley9e5da2a2019-08-06 18:52:07 +01005203TEST_SVE(sve_rdvl) {
5204 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5205 START();
5206
5207 // Encodable multipliers.
5208 __ Rdvl(x0, 0);
5209 __ Rdvl(x1, 1);
5210 __ Rdvl(x2, 2);
5211 __ Rdvl(x3, 31);
5212 __ Rdvl(x4, -1);
5213 __ Rdvl(x5, -2);
5214 __ Rdvl(x6, -32);
5215
5216 // For unencodable multipliers, the MacroAssembler uses a sequence of
5217 // instructions.
5218 __ Rdvl(x10, 32);
5219 __ Rdvl(x11, -33);
5220 __ Rdvl(x12, 42);
5221 __ Rdvl(x13, -42);
5222
5223 // The maximum value of VL is 256 (bytes), so the multiplier is limited to the
5224 // range [INT64_MIN/256, INT64_MAX/256], to ensure that no signed overflow
5225 // occurs in the macro.
5226 __ Rdvl(x14, 0x007fffffffffffff);
5227 __ Rdvl(x15, -0x0080000000000000);
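  // For example, at the architectural maximum VL of 256 bytes:
  //   0x007fffffffffffff * 256 = 0x7fffffffffffff00 <= INT64_MAX
  //  -0x0080000000000000 * 256 = INT64_MIN
  // so neither extreme can overflow a signed 64-bit result.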
5228
5229 END();
5230
5231 if (CAN_RUN()) {
5232 RUN();
5233
5234 uint64_t vl = config->sve_vl_in_bytes();
5235
5236 ASSERT_EQUAL_64(vl * 0, x0);
5237 ASSERT_EQUAL_64(vl * 1, x1);
5238 ASSERT_EQUAL_64(vl * 2, x2);
5239 ASSERT_EQUAL_64(vl * 31, x3);
5240 ASSERT_EQUAL_64(vl * -1, x4);
5241 ASSERT_EQUAL_64(vl * -2, x5);
5242 ASSERT_EQUAL_64(vl * -32, x6);
5243
5244 ASSERT_EQUAL_64(vl * 32, x10);
5245 ASSERT_EQUAL_64(vl * -33, x11);
5246 ASSERT_EQUAL_64(vl * 42, x12);
5247 ASSERT_EQUAL_64(vl * -42, x13);
5248
5249 ASSERT_EQUAL_64(vl * 0x007fffffffffffff, x14);
5250 ASSERT_EQUAL_64(vl * 0xff80000000000000, x15);
5251 }
5252}
5253
5254TEST_SVE(sve_rdpl) {
5255 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5256 START();
5257
5258 // There is no `rdpl` instruction, so the MacroAssembler maps `Rdpl` onto
5259 // Addpl(xd, xzr, ...).
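  // For example (a sketch, assuming VL = 256 bits so PL = 32 bits, i.e. four
  // bytes), `Rdpl(x0, 3)` behaves like `Addpl(x0, xzr, 3)` and sets x0 to
  // 3 * 4 = 12.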
5260
5261 // Encodable multipliers (as `addvl`).
5262 __ Rdpl(x0, 0);
5263 __ Rdpl(x1, 8);
5264 __ Rdpl(x2, 248);
5265 __ Rdpl(x3, -8);
5266 __ Rdpl(x4, -256);
5267
5268 // Encodable multipliers (as `movz` + `addpl`).
5269 __ Rdpl(x7, 31);
Jacob Bramley889984c2019-10-28 17:28:48 +00005270 __ Rdpl(x8, -31);
Jacob Bramley9e5da2a2019-08-06 18:52:07 +01005271
5272 // For unencodable multipliers, the MacroAssembler uses a sequence of
5273 // instructions.
5274 __ Rdpl(x10, 42);
5275 __ Rdpl(x11, -42);
5276
5277 // The maximum value of VL is 256 (bytes), so the multiplier is limited to the
5278 // range [INT64_MIN/256, INT64_MAX/256], to ensure that no signed overflow
5279 // occurs in the macro.
5280 __ Rdpl(x12, 0x007fffffffffffff);
5281 __ Rdpl(x13, -0x0080000000000000);
5282
5283 END();
5284
5285 if (CAN_RUN()) {
5286 RUN();
5287
5288 uint64_t vl = config->sve_vl_in_bytes();
5289 VIXL_ASSERT((vl % kZRegBitsPerPRegBit) == 0);
5290 uint64_t pl = vl / kZRegBitsPerPRegBit;
5291
5292 ASSERT_EQUAL_64(pl * 0, x0);
5293 ASSERT_EQUAL_64(pl * 8, x1);
5294 ASSERT_EQUAL_64(pl * 248, x2);
5295 ASSERT_EQUAL_64(pl * -8, x3);
5296 ASSERT_EQUAL_64(pl * -256, x4);
5297
5298 ASSERT_EQUAL_64(pl * 31, x7);
Jacob Bramley889984c2019-10-28 17:28:48 +00005299 ASSERT_EQUAL_64(pl * -31, x8);
Jacob Bramley9e5da2a2019-08-06 18:52:07 +01005300
5301 ASSERT_EQUAL_64(pl * 42, x10);
5302 ASSERT_EQUAL_64(pl * -42, x11);
5303
5304 ASSERT_EQUAL_64(pl * 0x007fffffffffffff, x12);
5305 ASSERT_EQUAL_64(pl * 0xff80000000000000, x13);
5306 }
5307}
5308
5309TEST_SVE(sve_addvl) {
5310 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5311 START();
5312
5313 uint64_t base = 0x1234567800000000;
5314 __ Mov(x30, base);
5315
5316 // Encodable multipliers.
5317 __ Addvl(x0, x30, 0);
5318 __ Addvl(x1, x30, 1);
5319 __ Addvl(x2, x30, 31);
5320 __ Addvl(x3, x30, -1);
5321 __ Addvl(x4, x30, -32);
5322
5323 // For unencodable multipliers, the MacroAssembler uses `Rdvl` and `Add`.
5324 __ Addvl(x5, x30, 32);
5325 __ Addvl(x6, x30, -33);
5326
5327 // Test the limits of the multiplier supported by the `Rdvl` macro.
5328 __ Addvl(x7, x30, 0x007fffffffffffff);
5329 __ Addvl(x8, x30, -0x0080000000000000);
5330
5331 // Check that xzr behaves correctly.
5332 __ Addvl(x9, xzr, 8);
5333 __ Addvl(x10, xzr, 42);
5334
5335 // Check that sp behaves correctly with encodable and unencodable multipliers.
5336 __ Addvl(sp, sp, -5);
5337 __ Addvl(sp, sp, -37);
5338 __ Addvl(x11, sp, -2);
5339 __ Addvl(sp, x11, 2);
5340 __ Addvl(x12, sp, -42);
5341
5342 // Restore the value of sp.
5343 __ Addvl(sp, x11, 39);
5344 __ Addvl(sp, sp, 5);
5345
5346 // Adjust x11 and x12 to make the test sp-agnostic.
5347 __ Sub(x11, sp, x11);
5348 __ Sub(x12, sp, x12);
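  // Tracing the arithmetic above (writing S for the original sp): sp reaches
  // S - 42 * VL, x11 = S - 44 * VL and x12 = S - 84 * VL; once sp is restored
  // to S, the subtractions leave x11 = 44 * VL and x12 = 84 * VL, matching
  // the checks below.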
5349
5350 // Check cases where xd.Is(xn). This stresses scratch register allocation.
5351 __ Mov(x20, x30);
5352 __ Mov(x21, x30);
5353 __ Mov(x22, x30);
5354 __ Addvl(x20, x20, 4);
5355 __ Addvl(x21, x21, 42);
5356 __ Addvl(x22, x22, -0x0080000000000000);
5357
5358 END();
5359
5360 if (CAN_RUN()) {
5361 RUN();
5362
5363 uint64_t vl = config->sve_vl_in_bytes();
5364
5365 ASSERT_EQUAL_64(base + (vl * 0), x0);
5366 ASSERT_EQUAL_64(base + (vl * 1), x1);
5367 ASSERT_EQUAL_64(base + (vl * 31), x2);
5368 ASSERT_EQUAL_64(base + (vl * -1), x3);
5369 ASSERT_EQUAL_64(base + (vl * -32), x4);
5370
5371 ASSERT_EQUAL_64(base + (vl * 32), x5);
5372 ASSERT_EQUAL_64(base + (vl * -33), x6);
5373
5374 ASSERT_EQUAL_64(base + (vl * 0x007fffffffffffff), x7);
5375 ASSERT_EQUAL_64(base + (vl * 0xff80000000000000), x8);
5376
5377 ASSERT_EQUAL_64(vl * 8, x9);
5378 ASSERT_EQUAL_64(vl * 42, x10);
5379
5380 ASSERT_EQUAL_64(vl * 44, x11);
5381 ASSERT_EQUAL_64(vl * 84, x12);
5382
5383 ASSERT_EQUAL_64(base + (vl * 4), x20);
5384 ASSERT_EQUAL_64(base + (vl * 42), x21);
5385 ASSERT_EQUAL_64(base + (vl * 0xff80000000000000), x22);
5386
5387 ASSERT_EQUAL_64(base, x30);
5388 }
5389}
5390
5391TEST_SVE(sve_addpl) {
5392 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5393 START();
5394
5395 uint64_t base = 0x1234567800000000;
5396 __ Mov(x30, base);
5397
5398 // Encodable multipliers.
5399 __ Addpl(x0, x30, 0);
5400 __ Addpl(x1, x30, 1);
5401 __ Addpl(x2, x30, 31);
5402 __ Addpl(x3, x30, -1);
5403 __ Addpl(x4, x30, -32);
5404
5405 // For unencodable multipliers, the MacroAssembler uses `Addvl` if it can, or
5406 // it falls back to `Rdvl` and `Add`.
5407 __ Addpl(x5, x30, 32);
5408 __ Addpl(x6, x30, -33);
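  // For example, 32 PL is exactly 4 VL (since PL = VL / 8), so the first case
  // above can presumably be encoded as a single `addvl x5, x30, #4`.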
5409
5410 // Test the limits of the multiplier supported by the `Rdvl` macro.
5411 __ Addpl(x7, x30, 0x007fffffffffffff);
5412 __ Addpl(x8, x30, -0x0080000000000000);
5413
5414 // Check that xzr behaves correctly.
5415 __ Addpl(x9, xzr, 8);
5416 __ Addpl(x10, xzr, 42);
5417
5418 // Check that sp behaves correctly with encodable and unencodable multipliers.
5419 __ Addpl(sp, sp, -5);
5420 __ Addpl(sp, sp, -37);
5421 __ Addpl(x11, sp, -2);
5422 __ Addpl(sp, x11, 2);
5423 __ Addpl(x12, sp, -42);
5424
5425 // Restore the value of sp.
5426 __ Addpl(sp, x11, 39);
5427 __ Addpl(sp, sp, 5);
5428
5429 // Adjust x11 and x12 to make the test sp-agnostic.
5430 __ Sub(x11, sp, x11);
5431 __ Sub(x12, sp, x12);
5432
5433 // Check cases where xd.Is(xn). This stresses scratch register allocation.
5434 __ Mov(x20, x30);
5435 __ Mov(x21, x30);
5436 __ Mov(x22, x30);
5437 __ Addpl(x20, x20, 4);
5438 __ Addpl(x21, x21, 42);
5439 __ Addpl(x22, x22, -0x0080000000000000);
5440
5441 END();
5442
5443 if (CAN_RUN()) {
5444 RUN();
5445
5446 uint64_t vl = config->sve_vl_in_bytes();
5447 VIXL_ASSERT((vl % kZRegBitsPerPRegBit) == 0);
5448 uint64_t pl = vl / kZRegBitsPerPRegBit;
5449
5450 ASSERT_EQUAL_64(base + (pl * 0), x0);
5451 ASSERT_EQUAL_64(base + (pl * 1), x1);
5452 ASSERT_EQUAL_64(base + (pl * 31), x2);
5453 ASSERT_EQUAL_64(base + (pl * -1), x3);
5454 ASSERT_EQUAL_64(base + (pl * -32), x4);
5455
5456 ASSERT_EQUAL_64(base + (pl * 32), x5);
5457 ASSERT_EQUAL_64(base + (pl * -33), x6);
5458
5459 ASSERT_EQUAL_64(base + (pl * 0x007fffffffffffff), x7);
5460 ASSERT_EQUAL_64(base + (pl * 0xff80000000000000), x8);
5461
5462 ASSERT_EQUAL_64(pl * 8, x9);
5463 ASSERT_EQUAL_64(pl * 42, x10);
5464
5465 ASSERT_EQUAL_64(pl * 44, x11);
5466 ASSERT_EQUAL_64(pl * 84, x12);
5467
5468 ASSERT_EQUAL_64(base + (pl * 4), x20);
5469 ASSERT_EQUAL_64(base + (pl * 42), x21);
5470 ASSERT_EQUAL_64(base + (pl * 0xff80000000000000), x22);
5471
5472 ASSERT_EQUAL_64(base, x30);
5473 }
5474}
5475
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005476TEST_SVE(sve_calculate_sve_address) {
5477 // Shadow the `MacroAssembler` type so that the test macros work without
5478 // modification.
5479 typedef CalculateSVEAddressMacroAssembler MacroAssembler;
5480
Jacob Bramley1314c462019-08-08 10:54:16 +01005481 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005482 START(); // NOLINT(clang-diagnostic-local-type-template-args)
Jacob Bramley1314c462019-08-08 10:54:16 +01005483
5484 uint64_t base = 0x1234567800000000;
5485 __ Mov(x28, base);
5486 __ Mov(x29, 48);
5487 __ Mov(x30, -48);
5488
5489 // Simple scalar (or equivalent) cases.
5490
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005491 __ CalculateSVEAddress(x0, SVEMemOperand(x28));
5492 __ CalculateSVEAddress(x1, SVEMemOperand(x28, 0));
5493 __ CalculateSVEAddress(x2, SVEMemOperand(x28, 0, SVE_MUL_VL));
5494 __ CalculateSVEAddress(x3, SVEMemOperand(x28, 0, SVE_MUL_VL), 3);
5495 __ CalculateSVEAddress(x4, SVEMemOperand(x28, xzr));
5496 __ CalculateSVEAddress(x5, SVEMemOperand(x28, xzr, LSL, 42));
Jacob Bramley1314c462019-08-08 10:54:16 +01005497
5498 // scalar-plus-immediate
5499
5500 // Unscaled immediates, handled with `Add`.
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005501 __ CalculateSVEAddress(x6, SVEMemOperand(x28, 42));
5502 __ CalculateSVEAddress(x7, SVEMemOperand(x28, -42));
Jacob Bramley1314c462019-08-08 10:54:16 +01005503 // Scaled immediates, handled with `Addvl` or `Addpl`.
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005504 __ CalculateSVEAddress(x8, SVEMemOperand(x28, 31, SVE_MUL_VL), 0);
5505 __ CalculateSVEAddress(x9, SVEMemOperand(x28, -32, SVE_MUL_VL), 0);
Jacob Bramley1314c462019-08-08 10:54:16 +01005506 // Out of `addvl` or `addpl` range.
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005507 __ CalculateSVEAddress(x10, SVEMemOperand(x28, 42, SVE_MUL_VL), 0);
5508 __ CalculateSVEAddress(x11, SVEMemOperand(x28, -42, SVE_MUL_VL), 0);
5509 // As above, for VL-based accesses smaller than a Z register.
5510 VIXL_STATIC_ASSERT(kZRegBitsPerPRegBitLog2 == 3);
5511 __ CalculateSVEAddress(x12, SVEMemOperand(x28, -32 * 8, SVE_MUL_VL), 3);
5512 __ CalculateSVEAddress(x13, SVEMemOperand(x28, -42 * 8, SVE_MUL_VL), 3);
5513 __ CalculateSVEAddress(x14, SVEMemOperand(x28, -32 * 4, SVE_MUL_VL), 2);
5514 __ CalculateSVEAddress(x15, SVEMemOperand(x28, -42 * 4, SVE_MUL_VL), 2);
5515 __ CalculateSVEAddress(x18, SVEMemOperand(x28, -32 * 2, SVE_MUL_VL), 1);
5516 __ CalculateSVEAddress(x19, SVEMemOperand(x28, -42 * 2, SVE_MUL_VL), 1);
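  // Worked example for x13 above: the immediate -42 * 8, combined with the
  // divisor of eight implied by the trailing argument (3, i.e. log2(8)),
  // works out to -336 * VL / 8 = -42 * VL bytes, as the checks below expect.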
Jacob Bramley1314c462019-08-08 10:54:16 +01005517
5518 // scalar-plus-scalar
5519
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005520 __ CalculateSVEAddress(x20, SVEMemOperand(x28, x29));
5521 __ CalculateSVEAddress(x21, SVEMemOperand(x28, x30));
5522 __ CalculateSVEAddress(x22, SVEMemOperand(x28, x29, LSL, 8));
5523 __ CalculateSVEAddress(x23, SVEMemOperand(x28, x30, LSL, 8));
Jacob Bramley1314c462019-08-08 10:54:16 +01005524
5525 // In-place updates, to stress scratch register allocation.
5526
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005527 __ Mov(x24, 0xabcd000000000000);
5528 __ Mov(x25, 0xabcd101100000000);
5529 __ Mov(x26, 0xabcd202200000000);
5530 __ Mov(x27, 0xabcd303300000000);
5531 __ Mov(x28, 0xabcd404400000000);
5532 __ Mov(x29, 0xabcd505500000000);
Jacob Bramley1314c462019-08-08 10:54:16 +01005533
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005534 __ CalculateSVEAddress(x24, SVEMemOperand(x24));
5535 __ CalculateSVEAddress(x25, SVEMemOperand(x25, 0x42));
5536 __ CalculateSVEAddress(x26, SVEMemOperand(x26, 3, SVE_MUL_VL), 0);
5537 __ CalculateSVEAddress(x27, SVEMemOperand(x27, 0x42, SVE_MUL_VL), 3);
5538 __ CalculateSVEAddress(x28, SVEMemOperand(x28, x30));
5539 __ CalculateSVEAddress(x29, SVEMemOperand(x29, x30, LSL, 4));
Jacob Bramley1314c462019-08-08 10:54:16 +01005540
5541 END();
5542
5543 if (CAN_RUN()) {
5544 RUN();
5545
5546 uint64_t vl = config->sve_vl_in_bytes();
5547 VIXL_ASSERT((vl % kZRegBitsPerPRegBit) == 0);
5548 uint64_t pl = vl / kZRegBitsPerPRegBit;
5549
5550 // Simple scalar (or equivalent) cases.
5551 ASSERT_EQUAL_64(base, x0);
5552 ASSERT_EQUAL_64(base, x1);
5553 ASSERT_EQUAL_64(base, x2);
5554 ASSERT_EQUAL_64(base, x3);
5555 ASSERT_EQUAL_64(base, x4);
5556 ASSERT_EQUAL_64(base, x5);
5557
5558 // scalar-plus-immediate
5559 ASSERT_EQUAL_64(base + 42, x6);
5560 ASSERT_EQUAL_64(base - 42, x7);
5561 ASSERT_EQUAL_64(base + (31 * vl), x8);
5562 ASSERT_EQUAL_64(base - (32 * vl), x9);
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005563 ASSERT_EQUAL_64(base + (42 * vl), x10);
5564 ASSERT_EQUAL_64(base - (42 * vl), x11);
5565 ASSERT_EQUAL_64(base - (32 * vl), x12);
Jacob Bramley1314c462019-08-08 10:54:16 +01005566 ASSERT_EQUAL_64(base - (42 * vl), x13);
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005567 ASSERT_EQUAL_64(base - (32 * vl), x14);
5568 ASSERT_EQUAL_64(base - (42 * vl), x15);
5569 ASSERT_EQUAL_64(base - (32 * vl), x18);
5570 ASSERT_EQUAL_64(base - (42 * vl), x19);
Jacob Bramley1314c462019-08-08 10:54:16 +01005571
5572 // scalar-plus-scalar
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005573 ASSERT_EQUAL_64(base + 48, x20);
5574 ASSERT_EQUAL_64(base - 48, x21);
5575 ASSERT_EQUAL_64(base + (48 << 8), x22);
5576 ASSERT_EQUAL_64(base - (48 << 8), x23);
Jacob Bramley1314c462019-08-08 10:54:16 +01005577
5578 // In-place updates.
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005579 ASSERT_EQUAL_64(0xabcd000000000000, x24);
5580 ASSERT_EQUAL_64(0xabcd101100000000 + 0x42, x25);
5581 ASSERT_EQUAL_64(0xabcd202200000000 + (3 * vl), x26);
5582 ASSERT_EQUAL_64(0xabcd303300000000 + (0x42 * pl), x27);
5583 ASSERT_EQUAL_64(0xabcd404400000000 - 48, x28);
5584 ASSERT_EQUAL_64(0xabcd505500000000 - (48 << 4), x29);
Jacob Bramley1314c462019-08-08 10:54:16 +01005585 }
5586}
5587
TatWai Chong4f28df72019-08-14 17:50:30 -07005588TEST_SVE(sve_permute_vector_unpredicated) {
5589 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
5590 START();
5591
Jacob Bramleye4983d42019-10-08 10:56:15 +01005592 // Initialise registers with known values first.
5593 __ Dup(z1.VnB(), 0x11);
5594 __ Dup(z2.VnB(), 0x22);
5595 __ Dup(z3.VnB(), 0x33);
5596 __ Dup(z4.VnB(), 0x44);
5597
TatWai Chong4f28df72019-08-14 17:50:30 -07005598 __ Mov(x0, 0x0123456789abcdef);
5599 __ Fmov(d0, RawbitsToDouble(0x7ffaaaaa22223456));
5600 __ Insr(z1.VnS(), w0);
5601 __ Insr(z2.VnD(), x0);
5602 __ Insr(z3.VnH(), h0);
5603 __ Insr(z4.VnD(), d0);
5604
5605 uint64_t inputs[] = {0xfedcba9876543210,
5606 0x0123456789abcdef,
5607 0x8f8e8d8c8b8a8988,
5608 0x8786858483828180};
5609
5610 // Initialize a distinguishable value throughout the register first.
5611 __ Dup(z9.VnB(), 0xff);
5612 InsrHelper(&masm, z9.VnD(), inputs);
5613
5614 __ Rev(z5.VnB(), z9.VnB());
5615 __ Rev(z6.VnH(), z9.VnH());
5616 __ Rev(z7.VnS(), z9.VnS());
5617 __ Rev(z8.VnD(), z9.VnD());
5618
5619 int index[7] = {22, 7, 7, 3, 1, 1, 63};
5620  // Broadcast an element from within the input array.
5621 __ Dup(z10.VnB(), z9.VnB(), index[0]);
5622 __ Dup(z11.VnH(), z9.VnH(), index[1]);
5623 __ Dup(z12.VnS(), z9.VnS(), index[2]);
5624 __ Dup(z13.VnD(), z9.VnD(), index[3]);
5625 __ Dup(z14.VnQ(), z9.VnQ(), index[4]);
5626 // Test dst == src
5627 __ Mov(z15, z9);
5628 __ Dup(z15.VnS(), z15.VnS(), index[5]);
5629  // Select an element beyond the input array.
5630 __ Dup(z16.VnB(), z9.VnB(), index[6]);
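  // Lane 63 only holds real data on sufficiently long vectors: the checks
  // below expect 0xff (the initial fill of z9) when VL is at least 512 bits,
  // and an all-zero result otherwise.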
5631
5632 END();
5633
5634 if (CAN_RUN()) {
5635 RUN();
5636
5637 // Insr
Jacob Bramleye4983d42019-10-08 10:56:15 +01005638 uint64_t z1_expected[] = {0x1111111111111111, 0x1111111189abcdef};
5639 uint64_t z2_expected[] = {0x2222222222222222, 0x0123456789abcdef};
5640 uint64_t z3_expected[] = {0x3333333333333333, 0x3333333333333456};
5641 uint64_t z4_expected[] = {0x4444444444444444, 0x7ffaaaaa22223456};
TatWai Chong4f28df72019-08-14 17:50:30 -07005642 ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
5643 ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
5644 ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
5645 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
5646
5647 // Rev
5648 int lane_count = core.GetSVELaneCount(kBRegSize);
5649 for (int i = 0; i < lane_count; i++) {
5650 uint64_t expected =
5651 core.zreg_lane(z5.GetCode(), kBRegSize, lane_count - i - 1);
5652 uint64_t input = core.zreg_lane(z9.GetCode(), kBRegSize, i);
5653 ASSERT_EQUAL_64(expected, input);
5654 }
5655
5656 lane_count = core.GetSVELaneCount(kHRegSize);
5657 for (int i = 0; i < lane_count; i++) {
5658 uint64_t expected =
5659 core.zreg_lane(z6.GetCode(), kHRegSize, lane_count - i - 1);
5660 uint64_t input = core.zreg_lane(z9.GetCode(), kHRegSize, i);
5661 ASSERT_EQUAL_64(expected, input);
5662 }
5663
5664 lane_count = core.GetSVELaneCount(kSRegSize);
5665 for (int i = 0; i < lane_count; i++) {
5666 uint64_t expected =
5667 core.zreg_lane(z7.GetCode(), kSRegSize, lane_count - i - 1);
5668 uint64_t input = core.zreg_lane(z9.GetCode(), kSRegSize, i);
5669 ASSERT_EQUAL_64(expected, input);
5670 }
5671
5672 lane_count = core.GetSVELaneCount(kDRegSize);
5673 for (int i = 0; i < lane_count; i++) {
5674 uint64_t expected =
5675 core.zreg_lane(z8.GetCode(), kDRegSize, lane_count - i - 1);
5676 uint64_t input = core.zreg_lane(z9.GetCode(), kDRegSize, i);
5677 ASSERT_EQUAL_64(expected, input);
5678 }
5679
5680 // Dup
5681 unsigned vl = config->sve_vl_in_bits();
5682 lane_count = core.GetSVELaneCount(kBRegSize);
5683 uint64_t expected_z10 = (vl > (index[0] * kBRegSize)) ? 0x23 : 0;
5684 for (int i = 0; i < lane_count; i++) {
5685 ASSERT_EQUAL_SVE_LANE(expected_z10, z10.VnB(), i);
5686 }
5687
5688 lane_count = core.GetSVELaneCount(kHRegSize);
5689 uint64_t expected_z11 = (vl > (index[1] * kHRegSize)) ? 0x8f8e : 0;
5690 for (int i = 0; i < lane_count; i++) {
5691 ASSERT_EQUAL_SVE_LANE(expected_z11, z11.VnH(), i);
5692 }
5693
5694 lane_count = core.GetSVELaneCount(kSRegSize);
5695 uint64_t expected_z12 = (vl > (index[2] * kSRegSize)) ? 0xfedcba98 : 0;
5696 for (int i = 0; i < lane_count; i++) {
5697 ASSERT_EQUAL_SVE_LANE(expected_z12, z12.VnS(), i);
5698 }
5699
5700 lane_count = core.GetSVELaneCount(kDRegSize);
5701 uint64_t expected_z13 =
5702 (vl > (index[3] * kDRegSize)) ? 0xfedcba9876543210 : 0;
5703 for (int i = 0; i < lane_count; i++) {
5704 ASSERT_EQUAL_SVE_LANE(expected_z13, z13.VnD(), i);
5705 }
5706
5707 lane_count = core.GetSVELaneCount(kDRegSize);
5708 uint64_t expected_z14_lo = 0;
5709 uint64_t expected_z14_hi = 0;
5710 if (vl > (index[4] * kQRegSize)) {
5711 expected_z14_lo = 0x0123456789abcdef;
5712 expected_z14_hi = 0xfedcba9876543210;
5713 }
5714 for (int i = 0; i < lane_count; i += 2) {
5715 ASSERT_EQUAL_SVE_LANE(expected_z14_lo, z14.VnD(), i);
5716 ASSERT_EQUAL_SVE_LANE(expected_z14_hi, z14.VnD(), i + 1);
5717 }
5718
5719 lane_count = core.GetSVELaneCount(kSRegSize);
5720 uint64_t expected_z15 = (vl > (index[5] * kSRegSize)) ? 0x87868584 : 0;
5721 for (int i = 0; i < lane_count; i++) {
5722 ASSERT_EQUAL_SVE_LANE(expected_z15, z15.VnS(), i);
5723 }
5724
5725 lane_count = core.GetSVELaneCount(kBRegSize);
5726 uint64_t expected_z16 = (vl > (index[6] * kBRegSize)) ? 0xff : 0;
5727 for (int i = 0; i < lane_count; i++) {
5728 ASSERT_EQUAL_SVE_LANE(expected_z16, z16.VnB(), i);
5729 }
5730 }
5731}
5732
Martyn Capewell2e954292020-01-14 14:56:42 +00005733TEST_SVE(sve_permute_vector_unpredicated_unpack_vector_elements) {
TatWai Chong4f28df72019-08-14 17:50:30 -07005734 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5735 START();
5736
5737 uint64_t z9_inputs[] = {0xfedcba9876543210,
5738 0x0123456789abcdef,
5739 0x8f8e8d8c8b8a8988,
5740 0x8786858483828180};
5741 InsrHelper(&masm, z9.VnD(), z9_inputs);
5742
5743 __ Sunpkhi(z10.VnH(), z9.VnB());
5744 __ Sunpkhi(z11.VnS(), z9.VnH());
5745 __ Sunpkhi(z12.VnD(), z9.VnS());
5746
5747 __ Sunpklo(z13.VnH(), z9.VnB());
5748 __ Sunpklo(z14.VnS(), z9.VnH());
5749 __ Sunpklo(z15.VnD(), z9.VnS());
5750
5751 __ Uunpkhi(z16.VnH(), z9.VnB());
5752 __ Uunpkhi(z17.VnS(), z9.VnH());
5753 __ Uunpkhi(z18.VnD(), z9.VnS());
5754
5755 __ Uunpklo(z19.VnH(), z9.VnB());
5756 __ Uunpklo(z20.VnS(), z9.VnH());
5757 __ Uunpklo(z21.VnD(), z9.VnS());
5758
Martyn Capewell2e954292020-01-14 14:56:42 +00005759 // Test unpacking with same source and destination.
5760 __ Mov(z22, z9);
5761 __ Sunpklo(z22.VnH(), z22.VnB());
5762 __ Mov(z23, z9);
5763 __ Uunpklo(z23.VnH(), z23.VnB());
5764
TatWai Chong4f28df72019-08-14 17:50:30 -07005765 END();
5766
5767 if (CAN_RUN()) {
5768 RUN();
5769
5770 // Sunpkhi
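// Lane i of each result should equal the sign-extension of the source lane at
// (i + lane_count), i.e. a lane taken from the high half of z9.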
5771 int lane_count = core.GetSVELaneCount(kHRegSize);
5772 for (int i = lane_count - 1; i >= 0; i--) {
5773 uint16_t expected = core.zreg_lane<uint16_t>(z10.GetCode(), i);
5774 uint8_t b_lane = core.zreg_lane<uint8_t>(z9.GetCode(), i + lane_count);
5775 uint16_t input = SignExtend<int16_t>(b_lane, kBRegSize);
5776 ASSERT_EQUAL_64(expected, input);
5777 }
5778
5779 lane_count = core.GetSVELaneCount(kSRegSize);
5780 for (int i = lane_count - 1; i >= 0; i--) {
5781 uint32_t expected = core.zreg_lane<uint32_t>(z11.GetCode(), i);
5782 uint16_t h_lane = core.zreg_lane<uint16_t>(z9.GetCode(), i + lane_count);
5783 uint32_t input = SignExtend<int32_t>(h_lane, kHRegSize);
5784 ASSERT_EQUAL_64(expected, input);
5785 }
5786
5787 lane_count = core.GetSVELaneCount(kDRegSize);
5788 for (int i = lane_count - 1; i >= 0; i--) {
5789 uint64_t expected = core.zreg_lane<uint64_t>(z12.GetCode(), i);
5790 uint32_t s_lane = core.zreg_lane<uint32_t>(z9.GetCode(), i + lane_count);
5791 uint64_t input = SignExtend<int64_t>(s_lane, kSRegSize);
5792 ASSERT_EQUAL_64(expected, input);
5793 }
5794
5795 // Sunpklo
5796 lane_count = core.GetSVELaneCount(kHRegSize);
5797 for (int i = lane_count - 1; i >= 0; i--) {
5798 uint16_t expected = core.zreg_lane<uint16_t>(z13.GetCode(), i);
5799 uint8_t b_lane = core.zreg_lane<uint8_t>(z9.GetCode(), i);
5800 uint16_t input = SignExtend<int16_t>(b_lane, kBRegSize);
5801 ASSERT_EQUAL_64(expected, input);
5802 }
5803
5804 lane_count = core.GetSVELaneCount(kSRegSize);
5805 for (int i = lane_count - 1; i >= 0; i--) {
5806 uint32_t expected = core.zreg_lane<uint32_t>(z14.GetCode(), i);
5807 uint16_t h_lane = core.zreg_lane<uint16_t>(z9.GetCode(), i);
5808 uint32_t input = SignExtend<int32_t>(h_lane, kHRegSize);
5809 ASSERT_EQUAL_64(expected, input);
5810 }
5811
5812 lane_count = core.GetSVELaneCount(kDRegSize);
5813 for (int i = lane_count - 1; i >= 0; i--) {
5814 uint64_t expected = core.zreg_lane<uint64_t>(z15.GetCode(), i);
5815 uint32_t s_lane = core.zreg_lane<uint32_t>(z9.GetCode(), i);
5816 uint64_t input = SignExtend<int64_t>(s_lane, kSRegSize);
5817 ASSERT_EQUAL_64(expected, input);
5818 }
5819
5820 // Uunpkhi
5821 lane_count = core.GetSVELaneCount(kHRegSize);
5822 for (int i = lane_count - 1; i >= 0; i--) {
5823 uint16_t expected = core.zreg_lane<uint16_t>(z16.GetCode(), i);
5824 uint16_t input = core.zreg_lane<uint8_t>(z9.GetCode(), i + lane_count);
5825 ASSERT_EQUAL_64(expected, input);
5826 }
5827
5828 lane_count = core.GetSVELaneCount(kSRegSize);
5829 for (int i = lane_count - 1; i >= 0; i--) {
5830 uint32_t expected = core.zreg_lane<uint32_t>(z17.GetCode(), i);
5831 uint32_t input = core.zreg_lane<uint16_t>(z9.GetCode(), i + lane_count);
5832 ASSERT_EQUAL_64(expected, input);
5833 }
5834
5835 lane_count = core.GetSVELaneCount(kDRegSize);
5836 for (int i = lane_count - 1; i >= 0; i--) {
5837 uint64_t expected = core.zreg_lane<uint64_t>(z18.GetCode(), i);
5838 uint64_t input = core.zreg_lane<uint32_t>(z9.GetCode(), i + lane_count);
5839 ASSERT_EQUAL_64(expected, input);
5840 }
5841
5842 // Uunpklo
5843 lane_count = core.GetSVELaneCount(kHRegSize);
5844 for (int i = lane_count - 1; i >= 0; i--) {
5845 uint16_t expected = core.zreg_lane<uint16_t>(z19.GetCode(), i);
5846 uint16_t input = core.zreg_lane<uint8_t>(z9.GetCode(), i);
5847 ASSERT_EQUAL_64(expected, input);
5848 }
5849
5850 lane_count = core.GetSVELaneCount(kSRegSize);
5851 for (int i = lane_count - 1; i >= 0; i--) {
5852 uint32_t expected = core.zreg_lane<uint32_t>(z20.GetCode(), i);
5853 uint32_t input = core.zreg_lane<uint16_t>(z9.GetCode(), i);
5854 ASSERT_EQUAL_64(expected, input);
5855 }
5856
5857 lane_count = core.GetSVELaneCount(kDRegSize);
5858 for (int i = lane_count - 1; i >= 0; i--) {
5859 uint64_t expected = core.zreg_lane<uint64_t>(z21.GetCode(), i);
5860 uint64_t input = core.zreg_lane<uint32_t>(z9.GetCode(), i);
5861 ASSERT_EQUAL_64(expected, input);
5862 }
Martyn Capewell2e954292020-01-14 14:56:42 +00005863
5864 ASSERT_EQUAL_SVE(z13, z22);
5865 ASSERT_EQUAL_SVE(z19, z23);
TatWai Chong4f28df72019-08-14 17:50:30 -07005866 }
5867}
5868
Jacob Bramleybc21a0d2019-09-20 18:49:15 +01005869TEST_SVE(sve_cnot_not) {
5870 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5871 START();
5872
5873 uint64_t in[] = {0x0000000000000000, 0x00000000e1c30000, 0x123456789abcdef0};
5874
5875 // For simplicity, we re-use the same pg for various lane sizes.
5876 // For D lanes: 1, 1, 0
5877 // For S lanes: 1, 1, 1, 0, 0
5878 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
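// (For lanes wider than a byte, only the predicate bit corresponding to the
// lowest-numbered byte of each lane is significant; the other bits are ignored.)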
5879 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
5880 Initialise(&masm, p0.VnB(), pg_in);
5881 PRegisterM pg = p0.Merging();
5882
5883 // These are merging operations, so we have to initialise the result register.
5884 // We use a mixture of constructive and destructive operations.
5885
5886 InsrHelper(&masm, z31.VnD(), in);
TatWai Chong6995bfd2019-09-26 10:48:05 +01005887 // Make a copy so we can check that constructive operations preserve zn.
Jacob Bramleybc21a0d2019-09-20 18:49:15 +01005888 __ Mov(z30, z31);
5889
5890 // For constructive operations, use a different initial result value.
5891 __ Index(z29.VnB(), 0, -1);
5892
5893 __ Mov(z0, z31);
5894 __ Cnot(z0.VnB(), pg, z0.VnB()); // destructive
5895 __ Mov(z1, z29);
5896 __ Cnot(z1.VnH(), pg, z31.VnH());
5897 __ Mov(z2, z31);
5898 __ Cnot(z2.VnS(), pg, z2.VnS()); // destructive
5899 __ Mov(z3, z29);
5900 __ Cnot(z3.VnD(), pg, z31.VnD());
5901
5902 __ Mov(z4, z29);
5903 __ Not(z4.VnB(), pg, z31.VnB());
5904 __ Mov(z5, z31);
5905 __ Not(z5.VnH(), pg, z5.VnH()); // destructive
5906 __ Mov(z6, z29);
5907 __ Not(z6.VnS(), pg, z31.VnS());
5908 __ Mov(z7, z31);
5909 __ Not(z7.VnD(), pg, z7.VnD()); // destructive
5910
5911 END();
5912
5913 if (CAN_RUN()) {
5914 RUN();
5915
5916 // Check that constructive operations preserve their inputs.
5917 ASSERT_EQUAL_SVE(z30, z31);
5918
5919 // clang-format off
5920
5921 // Cnot (B) destructive
5922 uint64_t expected_z0[] =
5923 // pg: 0 0 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0
5924 {0x0000000001000101, 0x01000001e1000101, 0x12340078000000f0};
5925 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
5926
5927 // Cnot (H)
5928 uint64_t expected_z1[] =
5929 // pg: 0 0 0 1 0 1 1 1 0 0 1 0
5930 {0xe9eaebecedee0001, 0xf1f2000100000001, 0xf9fafbfc0000ff00};
5931 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
5932
5933 // Cnot (S) destructive
5934 uint64_t expected_z2[] =
5935 // pg: 0 1 1 1 0 0
5936 {0x0000000000000001, 0x0000000100000000, 0x123456789abcdef0};
5937 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
5938
5939 // Cnot (D)
5940 uint64_t expected_z3[] =
5941 // pg: 1 1 0
5942 {0x0000000000000001, 0x0000000000000000, 0xf9fafbfcfdfeff00};
5943 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
5944
5945 // Not (B)
5946 uint64_t expected_z4[] =
5947 // pg: 0 0 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0
5948 {0xe9eaebecffeeffff, 0xfff2f3fff53cffff, 0xf9faa9fc65432100};
5949 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
5950
5951 // Not (H) destructive
5952 uint64_t expected_z5[] =
5953 // pg: 0 0 0 1 0 1 1 1 0 0 1 0
5954 {0x000000000000ffff, 0x0000ffff1e3cffff, 0x123456786543def0};
5955 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
5956
5957 // Not (S)
5958 uint64_t expected_z6[] =
5959 // pg: 0 1 1 1 0 0
5960 {0xe9eaebecffffffff, 0xffffffff1e3cffff, 0xf9fafbfcfdfeff00};
5961 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
5962
5963 // Not (D) destructive
5964 uint64_t expected_z7[] =
5965 // pg: 1 1 0
5966 {0xffffffffffffffff, 0xffffffff1e3cffff, 0x123456789abcdef0};
5967 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
5968
5969 // clang-format on
5970 }
5971}
5972
5973TEST_SVE(sve_fabs_fneg) {
5974 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5975 START();
5976
5977 // Include FP64, FP32 and FP16 signalling NaNs. Most FP operations quieten
5978 // NaNs, but fabs and fneg do not.
5979 uint64_t in[] = {0xc04500004228d140, // Recognisable (+/-42) values.
5980 0xfff00000ff80fc01, // Signalling NaNs.
5981 0x123456789abcdef0};
5982
5983 // For simplicity, we re-use the same pg for various lane sizes.
5984 // For D lanes: 1, 1, 0
5985 // For S lanes: 1, 1, 1, 0, 0
5986 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
5987 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
5988 Initialise(&masm, p0.VnB(), pg_in);
5989 PRegisterM pg = p0.Merging();
5990
5991 // These are merging operations, so we have to initialise the result register.
5992 // We use a mixture of constructive and destructive operations.
5993
5994 InsrHelper(&masm, z31.VnD(), in);
TatWai Chong6995bfd2019-09-26 10:48:05 +01005995 // Make a copy so we can check that constructive operations preserve zn.
Jacob Bramleybc21a0d2019-09-20 18:49:15 +01005996 __ Mov(z30, z31);
5997
5998 // For constructive operations, use a different initial result value.
5999 __ Index(z29.VnB(), 0, -1);
6000
6001 __ Mov(z0, z29);
6002 __ Fabs(z0.VnH(), pg, z31.VnH());
6003 __ Mov(z1, z31);
6004 __ Fabs(z1.VnS(), pg, z1.VnS()); // destructive
6005 __ Mov(z2, z29);
6006 __ Fabs(z2.VnD(), pg, z31.VnD());
6007
6008 __ Mov(z3, z31);
6009 __ Fneg(z3.VnH(), pg, z3.VnH()); // destructive
6010 __ Mov(z4, z29);
6011 __ Fneg(z4.VnS(), pg, z31.VnS());
6012 __ Mov(z5, z31);
6013 __ Fneg(z5.VnD(), pg, z5.VnD()); // destructive
6014
6015 END();
6016
6017 if (CAN_RUN()) {
6018 RUN();
6019
6020 // Check that constructive operations preserve their inputs.
6021 ASSERT_EQUAL_SVE(z30, z31);
6022
6023 // clang-format off
6024
6025 // Fabs (H)
6026 uint64_t expected_z0[] =
6027 // pg: 0 0 0 1 0 1 1 1 0 0 1 0
6028 {0xe9eaebecedee5140, 0xf1f200007f807c01, 0xf9fafbfc1abcff00};
6029 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
6030
6031 // Fabs (S) destructive
6032 uint64_t expected_z1[] =
6033 // pg: 0 1 1 1 0 0
6034 {0xc04500004228d140, 0x7ff000007f80fc01, 0x123456789abcdef0};
6035 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
6036
6037 // Fabs (D)
6038 uint64_t expected_z2[] =
6039 // pg: 1 1 0
6040 {0x404500004228d140, 0x7ff00000ff80fc01, 0xf9fafbfcfdfeff00};
6041 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
6042
6043 // Fneg (H) destructive
6044 uint64_t expected_z3[] =
6045 // pg: 0 0 0 1 0 1 1 1 0 0 1 0
6046 {0xc045000042285140, 0xfff080007f807c01, 0x123456781abcdef0};
6047 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
6048
6049 // Fneg (S)
6050 uint64_t expected_z4[] =
6051 // pg: 0 1 1 1 0 0
6052 {0xe9eaebecc228d140, 0x7ff000007f80fc01, 0xf9fafbfcfdfeff00};
6053 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
6054
6055 // Fneg (D) destructive
6056 uint64_t expected_z5[] =
6057 // pg: 1 1 0
6058 {0x404500004228d140, 0x7ff00000ff80fc01, 0x123456789abcdef0};
6059 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
6060
6061 // clang-format on
6062 }
6063}
6064
6065TEST_SVE(sve_cls_clz_cnt) {
6066 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6067 START();
6068
6069 uint64_t in[] = {0x0000000000000000, 0xfefcf8f0e1c3870f, 0x123456789abcdef0};
6070
6071 // For simplicity, we re-use the same pg for various lane sizes.
6072 // For D lanes: 1, 1, 0
6073 // For S lanes: 1, 1, 1, 0, 0
6074 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
6075 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
6076 Initialise(&masm, p0.VnB(), pg_in);
6077 PRegisterM pg = p0.Merging();
6078
6079 // These are merging operations, so we have to initialise the result register.
6080 // We use a mixture of constructive and destructive operations.
6081
6082 InsrHelper(&masm, z31.VnD(), in);
TatWai Chong6995bfd2019-09-26 10:48:05 +01006083 // Make a copy so we can check that constructive operations preserve zn.
Jacob Bramleybc21a0d2019-09-20 18:49:15 +01006084 __ Mov(z30, z31);
6085
6086 // For constructive operations, use a different initial result value.
6087 __ Index(z29.VnB(), 0, -1);
6088
6089 __ Mov(z0, z29);
6090 __ Cls(z0.VnB(), pg, z31.VnB());
6091 __ Mov(z1, z31);
6092 __ Clz(z1.VnH(), pg, z1.VnH()); // destructive
6093 __ Mov(z2, z29);
6094 __ Cnt(z2.VnS(), pg, z31.VnS());
6095 __ Mov(z3, z31);
6096 __ Cnt(z3.VnD(), pg, z3.VnD()); // destructive
6097
6098 END();
6099
6100 if (CAN_RUN()) {
6101 RUN();
6102 // Check that non-destructive operations preserve their inputs.
6103 ASSERT_EQUAL_SVE(z30, z31);
6104
6105 // clang-format off
6106
6107 // cls (B)
6108 uint8_t expected_z0[] =
6109 // pg: 0 0 0 0 1 0 1 1
6110 // pg: 1 0 0 1 0 1 1 1
6111 // pg: 0 0 1 0 1 1 1 0
6112 {0xe9, 0xea, 0xeb, 0xec, 7, 0xee, 7, 7,
6113 6, 0xf2, 0xf3, 3, 0xf5, 1, 0, 3,
6114 0xf9, 0xfa, 0, 0xfc, 0, 0, 1, 0x00};
6115 ASSERT_EQUAL_SVE(expected_z0, z0.VnB());
6116
6117 // clz (H) destructive
6118 uint16_t expected_z1[] =
6119 // pg: 0 0 0 1
6120 // pg: 0 1 1 1
6121 // pg: 0 0 1 0
6122 {0x0000, 0x0000, 0x0000, 16,
6123 0xfefc, 0, 0, 0,
6124 0x1234, 0x5678, 0, 0xdef0};
6125 ASSERT_EQUAL_SVE(expected_z1, z1.VnH());
6126
6127 // cnt (S)
6128 uint32_t expected_z2[] =
6129 // pg: 0 1
6130 // pg: 1 1
6131 // pg: 0 0
6132 {0xe9eaebec, 0,
6133 22, 16,
6134 0xf9fafbfc, 0xfdfeff00};
6135 ASSERT_EQUAL_SVE(expected_z2, z2.VnS());
6136
6137 // cnt (D) destructive
6138 uint64_t expected_z3[] =
6139 // pg: 1 1 0
6140 { 0, 38, 0x123456789abcdef0};
6141 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
6142
6143 // clang-format on
6144 }
6145}
6146
6147TEST_SVE(sve_sxt) {
6148 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6149 START();
6150
6151 uint64_t in[] = {0x01f203f405f607f8, 0xfefcf8f0e1c3870f, 0x123456789abcdef0};
6152
6153 // For simplicity, we re-use the same pg for various lane sizes.
6154 // For D lanes: 1, 1, 0
6155 // For S lanes: 1, 1, 1, 0, 0
6156 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
6157 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
6158 Initialise(&masm, p0.VnB(), pg_in);
6159 PRegisterM pg = p0.Merging();
6160
6161 // These are merging operations, so we have to initialise the result register.
6162 // We use a mixture of constructive and destructive operations.
6163
6164 InsrHelper(&masm, z31.VnD(), in);
TatWai Chong6995bfd2019-09-26 10:48:05 +01006165 // Make a copy so we can check that constructive operations preserve zn.
Jacob Bramleybc21a0d2019-09-20 18:49:15 +01006166 __ Mov(z30, z31);
6167
6168 // For constructive operations, use a different initial result value.
6169 __ Index(z29.VnB(), 0, -1);
6170
6171 __ Mov(z0, z31);
6172 __ Sxtb(z0.VnH(), pg, z0.VnH()); // destructive
6173 __ Mov(z1, z29);
6174 __ Sxtb(z1.VnS(), pg, z31.VnS());
6175 __ Mov(z2, z31);
6176 __ Sxtb(z2.VnD(), pg, z2.VnD()); // destructive
6177 __ Mov(z3, z29);
6178 __ Sxth(z3.VnS(), pg, z31.VnS());
6179 __ Mov(z4, z31);
6180 __ Sxth(z4.VnD(), pg, z4.VnD()); // destructive
6181 __ Mov(z5, z29);
6182 __ Sxtw(z5.VnD(), pg, z31.VnD());
6183
6184 END();
6185
6186 if (CAN_RUN()) {
6187 RUN();
6188 // Check that constructive operations preserve their inputs.
6189 ASSERT_EQUAL_SVE(z30, z31);
6190
6191 // clang-format off
6192
6193 // Sxtb (H) destructive
6194 uint64_t expected_z0[] =
6195 // pg: 0 0 0 1 0 1 1 1 0 0 1 0
6196 {0x01f203f405f6fff8, 0xfefcfff0ffc3000f, 0x12345678ffbcdef0};
6197 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
6198
6199 // Sxtb (S)
6200 uint64_t expected_z1[] =
6201 // pg: 0 1 1 1 0 0
6202 {0xe9eaebecfffffff8, 0xfffffff00000000f, 0xf9fafbfcfdfeff00};
6203 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
6204
6205 // Sxtb (D) destructive
6206 uint64_t expected_z2[] =
6207 // pg: 1 1 0
6208 {0xfffffffffffffff8, 0x000000000000000f, 0x123456789abcdef0};
6209 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
6210
6211 // Sxth (S)
6212 uint64_t expected_z3[] =
6213 // pg: 0 1 1 1 0 0
6214 {0xe9eaebec000007f8, 0xfffff8f0ffff870f, 0xf9fafbfcfdfeff00};
6215 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
6216
6217 // Sxth (D) destructive
6218 uint64_t expected_z4[] =
6219 // pg: 1 1 0
6220 {0x00000000000007f8, 0xffffffffffff870f, 0x123456789abcdef0};
6221 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
6222
6223 // Sxtw (D)
6224 uint64_t expected_z5[] =
6225 // pg: 1 1 0
6226 {0x0000000005f607f8, 0xffffffffe1c3870f, 0xf9fafbfcfdfeff00};
6227 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
6228
6229 // clang-format on
6230 }
6231}
6232
6233TEST_SVE(sve_uxt) {
6234 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6235 START();
6236
6237 uint64_t in[] = {0x01f203f405f607f8, 0xfefcf8f0e1c3870f, 0x123456789abcdef0};
6238
6239 // For simplicity, we re-use the same pg for various lane sizes.
6240 // For D lanes: 1, 1, 0
6241 // For S lanes: 1, 1, 1, 0, 0
6242 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
6243 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
6244 Initialise(&masm, p0.VnB(), pg_in);
6245 PRegisterM pg = p0.Merging();
6246
6247 // These are merging operations, so we have to initialise the result register.
6248 // We use a mixture of constructive and destructive operations.
6249
6250 InsrHelper(&masm, z31.VnD(), in);
TatWai Chong6995bfd2019-09-26 10:48:05 +01006251 // Make a copy so we can check that constructive operations preserve zn.
Jacob Bramleybc21a0d2019-09-20 18:49:15 +01006252 __ Mov(z30, z31);
6253
6254 // For constructive operations, use a different initial result value.
6255 __ Index(z29.VnB(), 0, -1);
6256
6257 __ Mov(z0, z29);
6258 __ Uxtb(z0.VnH(), pg, z31.VnH());
6259 __ Mov(z1, z31);
6260 __ Uxtb(z1.VnS(), pg, z1.VnS()); // destructive
6261 __ Mov(z2, z29);
6262 __ Uxtb(z2.VnD(), pg, z31.VnD());
6263 __ Mov(z3, z31);
6264 __ Uxth(z3.VnS(), pg, z3.VnS()); // destructive
6265 __ Mov(z4, z29);
6266 __ Uxth(z4.VnD(), pg, z31.VnD());
6267 __ Mov(z5, z31);
6268 __ Uxtw(z5.VnD(), pg, z5.VnD()); // destructive
6269
6270 END();
6271
6272 if (CAN_RUN()) {
6273 RUN();
6274 // clang-format off
6275
6276 // Uxtb (H)
6277 uint64_t expected_z0[] =
6278 // pg: 0 0 0 1 0 1 1 1 0 0 1 0
6279 {0xe9eaebecedee00f8, 0xf1f200f000c3000f, 0xf9fafbfc00bcff00};
6280 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
6281
6282 // Uxtb (S) destructive
6283 uint64_t expected_z1[] =
6284 // pg: 0 1 1 1 0 0
6285 {0x01f203f4000000f8, 0x000000f00000000f, 0x123456789abcdef0};
6286 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
6287
6288 // Uxtb (D)
6289 uint64_t expected_z2[] =
6290 // pg: 1 1 0
6291 {0x00000000000000f8, 0x000000000000000f, 0xf9fafbfcfdfeff00};
6292 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
6293
6294 // Uxth (S) destructive
6295 uint64_t expected_z3[] =
6296 // pg: 0 1 1 1 0 0
6297 {0x01f203f4000007f8, 0x0000f8f00000870f, 0x123456789abcdef0};
6298 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
6299
6300 // Uxth (D)
6301 uint64_t expected_z4[] =
6302 // pg: 1 1 0
6303 {0x00000000000007f8, 0x000000000000870f, 0xf9fafbfcfdfeff00};
6304 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
6305
6306 // Uxtw (D) destructive
6307 uint64_t expected_z5[] =
6308 // pg: 1 1 0
6309 {0x0000000005f607f8, 0x00000000e1c3870f, 0x123456789abcdef0};
6310 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
6311
6312 // clang-format on
6313 }
6314}
6315
6316TEST_SVE(sve_abs_neg) {
6317 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6318 START();
6319
6320 uint64_t in[] = {0x01f203f405f607f8, 0xfefcf8f0e1c3870f, 0x123456789abcdef0};
6321
6322 // For simplicity, we re-use the same pg for various lane sizes.
6323 // For D lanes: 1, 1, 0
6324 // For S lanes: 1, 1, 1, 0, 0
6325 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
6326 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
6327 Initialise(&masm, p0.VnB(), pg_in);
6328 PRegisterM pg = p0.Merging();
6329
6330 InsrHelper(&masm, z31.VnD(), in);
6331
6332 // These are merging operations, so we have to initialise the result register.
6333 // We use a mixture of constructive and destructive operations.
6334
6335 InsrHelper(&masm, z31.VnD(), in);
TatWai Chong6995bfd2019-09-26 10:48:05 +01006336 // Make a copy so we can check that constructive operations preserve zn.
Jacob Bramleybc21a0d2019-09-20 18:49:15 +01006337 __ Mov(z30, z31);
6338
6339 // For constructive operations, use a different initial result value.
6340 __ Index(z29.VnB(), 0, -1);
6341
6342 __ Mov(z0, z31);
6343 __ Abs(z0.VnD(), pg, z0.VnD()); // destructive
6344 __ Mov(z1, z29);
6345 __ Abs(z1.VnB(), pg, z31.VnB());
6346
6347 __ Mov(z2, z31);
6348 __ Neg(z2.VnH(), pg, z2.VnH()); // destructive
6349 __ Mov(z3, z29);
6350 __ Neg(z3.VnS(), pg, z31.VnS());
6351
Jacob Bramleyc0066272019-09-30 16:30:47 +01006352 // The unpredicated form of `Neg` is implemented using `subr`.
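// (`subr` reverses the operands of `sub`, so subtracting zn from zero yields
// the negation.)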
6353 __ Mov(z4, z31);
6354 __ Neg(z4.VnB(), z4.VnB()); // destructive
6355 __ Mov(z5, z29);
6356 __ Neg(z5.VnD(), z31.VnD());
6357
Jacob Bramleybc21a0d2019-09-20 18:49:15 +01006358 END();
6359
6360 if (CAN_RUN()) {
6361 RUN();
Jacob Bramleyc0066272019-09-30 16:30:47 +01006362
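// Check that constructive operations preserve their inputs.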
6363 ASSERT_EQUAL_SVE(z30, z31);
6364
Jacob Bramleybc21a0d2019-09-20 18:49:15 +01006365 // clang-format off
6366
6367 // Abs (D) destructive
6368 uint64_t expected_z0[] =
6369 // pg: 1 1 0
6370 {0x01f203f405f607f8, 0x0103070f1e3c78f1, 0x123456789abcdef0};
6371 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
6372
6373 // Abs (B)
6374 uint64_t expected_z1[] =
6375 // pg: 0 0 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0
6376 {0xe9eaebec05ee0708, 0x02f2f310f53d790f, 0xf9fa56fc66442200};
6377 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
6378
6379 // Neg (H) destructive
6380 uint64_t expected_z2[] =
6381 // pg: 0 0 0 1 0 1 1 1 0 0 1 0
6382 {0x01f203f405f6f808, 0xfefc07101e3d78f1, 0x123456786544def0};
6383 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
6384
6385 // Neg (S)
6386 uint64_t expected_z3[] =
6387 // pg: 0 1 1 1 0 0
6388 {0xe9eaebecfa09f808, 0x010307101e3c78f1, 0xf9fafbfcfdfeff00};
6389 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
6390
Jacob Bramleyc0066272019-09-30 16:30:47 +01006391 // Neg (B) destructive, unpredicated
6392 uint64_t expected_z4[] =
6393 {0xff0efd0cfb0af908, 0x020408101f3d79f1, 0xeeccaa8866442210};
6394 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
6395
6396 // Neg (D) unpredicated
6397 uint64_t expected_z5[] =
6398 {0xfe0dfc0bfa09f808, 0x0103070f1e3c78f1, 0xedcba98765432110};
6399 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
6400
Jacob Bramleybc21a0d2019-09-20 18:49:15 +01006401 // clang-format on
6402 }
6403}
6404
Jacob Bramley0093bb92019-10-04 15:54:10 +01006405TEST_SVE(sve_cpy) {
6406 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
6407 START();
6408
6409 // For simplicity, we re-use the same pg for various lane sizes.
6410 // For D lanes: 0, 1, 1
6411 // For S lanes: 0, 1, 1, 0, 1
6412 // For H lanes: 1, 0, 0, 1, 0, 1, 1, 0, 0, 1
6413 int pg_in[] = {1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1};
6414
6415 PRegisterM pg = p7.Merging();
6416 Initialise(&masm, pg.VnB(), pg_in);
6417
6418 // These are merging operations, so we have to initialise the result registers
6419 // for each operation.
6420 for (unsigned i = 0; i < kNumberOfZRegisters; i++) {
6421 __ Index(ZRegister(i, kBRegSize), 0, -1);
6422 }
6423
6424 // Recognisable values to copy.
6425 __ Mov(x0, 0xdeadbeefdeadbe42);
6426 __ Mov(x1, 0xdeadbeefdead8421);
6427 __ Mov(x2, 0xdeadbeef80042001);
6428 __ Mov(x3, 0x8000000420000001);
6429
6430 // Use NEON moves, to avoid testing SVE `cpy` against itself.
6431 __ Dup(v28.V2D(), x0);
6432 __ Dup(v29.V2D(), x1);
6433 __ Dup(v30.V2D(), x2);
6434 __ Dup(v31.V2D(), x3);
6435
6436 // Register forms (CPY_z_p_r)
6437 __ Cpy(z0.VnB(), pg, w0);
6438 __ Cpy(z1.VnH(), pg, x1); // X registers are accepted for small lanes.
6439 __ Cpy(z2.VnS(), pg, w2);
6440 __ Cpy(z3.VnD(), pg, x3);
6441
6442 // VRegister forms (CPY_z_p_v)
6443 __ Cpy(z4.VnB(), pg, b28);
6444 __ Cpy(z5.VnH(), pg, h29);
6445 __ Cpy(z6.VnS(), pg, s30);
6446 __ Cpy(z7.VnD(), pg, d31);
6447
6448 // Check that we can copy the stack pointer.
6449 __ Mov(x10, sp);
6450 __ Mov(sp, 0xabcabcabcabcabca); // Set sp to a known value.
6451 __ Cpy(z16.VnB(), pg, sp);
6452 __ Cpy(z17.VnH(), pg, wsp);
6453 __ Cpy(z18.VnS(), pg, wsp);
6454 __ Cpy(z19.VnD(), pg, sp);
6455 __ Mov(sp, x10); // Restore sp.
6456
6457 END();
6458
6459 if (CAN_RUN()) {
6460 RUN();
6461 // clang-format off
6462
6463 uint64_t expected_b[] =
6464 // pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1
6465 {0xe9eaebec424242f0, 0x42f2f34242f64242, 0xf942fbfcfdfeff42};
6466 ASSERT_EQUAL_SVE(expected_b, z0.VnD());
6467 ASSERT_EQUAL_SVE(expected_b, z4.VnD());
6468
6469 uint64_t expected_h[] =
6470 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6471 {0xe9eaebec8421eff0, 0xf1f28421f5f68421, 0x8421fbfcfdfe8421};
6472 ASSERT_EQUAL_SVE(expected_h, z1.VnD());
6473 ASSERT_EQUAL_SVE(expected_h, z5.VnD());
6474
6475 uint64_t expected_s[] =
6476 // pg: 0 0 1 1 0 1
6477 {0xe9eaebecedeeeff0, 0x8004200180042001, 0xf9fafbfc80042001};
6478 ASSERT_EQUAL_SVE(expected_s, z2.VnD());
6479 ASSERT_EQUAL_SVE(expected_s, z6.VnD());
6480
6481 uint64_t expected_d[] =
6482 // pg: 0 1 1
6483 {0xe9eaebecedeeeff0, 0x8000000420000001, 0x8000000420000001};
6484 ASSERT_EQUAL_SVE(expected_d, z3.VnD());
6485 ASSERT_EQUAL_SVE(expected_d, z7.VnD());
6486
6487
6488 uint64_t expected_b_sp[] =
6489 // pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1
6490 {0xe9eaebeccacacaf0, 0xcaf2f3cacaf6caca, 0xf9cafbfcfdfeffca};
6491 ASSERT_EQUAL_SVE(expected_b_sp, z16.VnD());
6492
6493 uint64_t expected_h_sp[] =
6494 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6495 {0xe9eaebecabcaeff0, 0xf1f2abcaf5f6abca, 0xabcafbfcfdfeabca};
6496 ASSERT_EQUAL_SVE(expected_h_sp, z17.VnD());
6497
6498 uint64_t expected_s_sp[] =
6499 // pg: 0 0 1 1 0 1
6500 {0xe9eaebecedeeeff0, 0xcabcabcacabcabca, 0xf9fafbfccabcabca};
6501 ASSERT_EQUAL_SVE(expected_s_sp, z18.VnD());
6502
6503 uint64_t expected_d_sp[] =
6504 // pg: 0 1 1
6505 {0xe9eaebecedeeeff0, 0xabcabcabcabcabca, 0xabcabcabcabcabca};
6506 ASSERT_EQUAL_SVE(expected_d_sp, z19.VnD());
6507
6508 // clang-format on
6509 }
6510}
6511
Jacob Bramley0f62eab2019-10-23 17:07:47 +01006512TEST_SVE(sve_cpy_imm) {
6513 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6514 START();
6515
6516 // For simplicity, we re-use the same pg for various lane sizes.
6517 // For D lanes: 0, 1, 1
6518 // For S lanes: 0, 1, 1, 0, 1
6519 // For H lanes: 1, 0, 0, 1, 0, 1, 1, 0, 0, 1
6520 int pg_in[] = {1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1};
6521
6522 PRegister pg = p7;
6523 Initialise(&masm, pg.VnB(), pg_in);
6524
6525 // These are (mostly) merging operations, so we have to initialise the result
6526 // registers for each operation.
6527 for (unsigned i = 0; i < kNumberOfZRegisters; i++) {
6528 __ Index(ZRegister(i, kBRegSize), 0, -1);
6529 }
6530
6531 // Encodable integer forms (CPY_z_p_i)
6532 __ Cpy(z0.VnB(), pg.Merging(), 0);
6533 __ Cpy(z1.VnB(), pg.Zeroing(), 42);
6534 __ Cpy(z2.VnB(), pg.Merging(), -42);
6535 __ Cpy(z3.VnB(), pg.Zeroing(), 0xff);
6536 __ Cpy(z4.VnH(), pg.Merging(), 127);
6537 __ Cpy(z5.VnS(), pg.Zeroing(), -128);
6538 __ Cpy(z6.VnD(), pg.Merging(), -1);
6539
6540 // Forms encodable using fcpy.
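// (-31.0, 2.0 and -4.0 are all representable as 8-bit FP immediates, so no
// scratch register is needed.)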
6541 __ Cpy(z7.VnH(), pg.Merging(), Float16ToRawbits(Float16(-31.0)));
6542 __ Cpy(z8.VnS(), pg.Zeroing(), FloatToRawbits(2.0f));
6543 __ Cpy(z9.VnD(), pg.Merging(), DoubleToRawbits(-4.0));
6544
6545 // Other forms use a scratch register.
6546 __ Cpy(z10.VnH(), pg.Merging(), 0xff);
6547 __ Cpy(z11.VnD(), pg.Zeroing(), 0x0123456789abcdef);
6548
6549 END();
6550
6551 if (CAN_RUN()) {
6552 RUN();
6553 // clang-format off
6554
6555 uint64_t expected_z0[] =
6556 // pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1
6557 {0xe9eaebec000000f0, 0x00f2f30000f60000, 0xf900fbfcfdfeff00};
6558 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
6559
6560 uint64_t expected_z1[] =
6561 // pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1
6562 {0x000000002a2a2a00, 0x2a00002a2a002a2a, 0x002a00000000002a};
6563 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
6564
6565 uint64_t expected_z2[] =
6566 // pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1
6567 {0xe9eaebecd6d6d6f0, 0xd6f2f3d6d6f6d6d6, 0xf9d6fbfcfdfeffd6};
6568 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
6569
6570 uint64_t expected_z3[] =
6571 // pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1
6572 {0x00000000ffffff00, 0xff0000ffff00ffff, 0x00ff0000000000ff};
6573 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
6574
6575 uint64_t expected_z4[] =
6576 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6577 {0xe9eaebec007feff0, 0xf1f2007ff5f6007f, 0x007ffbfcfdfe007f};
6578 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
6579
6580 uint64_t expected_z5[] =
6581 // pg: 0 0 1 1 0 1
6582 {0x0000000000000000, 0xffffff80ffffff80, 0x00000000ffffff80};
6583 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
6584
6585 uint64_t expected_z6[] =
6586 // pg: 0 1 1
6587 {0xe9eaebecedeeeff0, 0xffffffffffffffff, 0xffffffffffffffff};
6588 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
6589
6590 uint64_t expected_z7[] =
6591 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6592 {0xe9eaebeccfc0eff0, 0xf1f2cfc0f5f6cfc0, 0xcfc0fbfcfdfecfc0};
6593 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
6594
6595 uint64_t expected_z8[] =
6596 // pg: 0 0 1 1 0 1
6597 {0x0000000000000000, 0x4000000040000000, 0x0000000040000000};
6598 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
6599
6600 uint64_t expected_z9[] =
6601 // pg: 0 1 1
6602 {0xe9eaebecedeeeff0, 0xc010000000000000, 0xc010000000000000};
6603 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
6604
6605 uint64_t expected_z10[] =
6606 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6607 {0xe9eaebec00ffeff0, 0xf1f200fff5f600ff, 0x00fffbfcfdfe00ff};
6608 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
6609
6610 uint64_t expected_z11[] =
6611 // pg: 0 1 1
6612 {0x0000000000000000, 0x0123456789abcdef, 0x0123456789abcdef};
6613 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
6614
6615 // clang-format on
6616 }
6617}
6618
6619TEST_SVE(sve_fcpy_imm) {
6620 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6621 START();
6622
6623 // For simplicity, we re-use the same pg for various lane sizes.
6624 // For D lanes: 0, 1, 1
6625 // For S lanes: 0, 1, 1, 0, 1
6626 // For H lanes: 1, 0, 0, 1, 0, 1, 1, 0, 0, 1
6627 int pg_in[] = {1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1};
6628
6629 PRegister pg = p7;
6630 Initialise(&masm, pg.VnB(), pg_in);
6631
6632 // These are (mostly) merging operations, so we have to initialise the result
6633 // registers for each operation.
6634 for (unsigned i = 0; i < kNumberOfZRegisters; i++) {
6635 __ Index(ZRegister(i, kBRegSize), 0, -1);
6636 }
6637
6638 // Encodable floating-point forms (FCPY_z_p_i)
6639 __ Fcpy(z1.VnH(), pg.Merging(), Float16(1.0));
6640 __ Fcpy(z2.VnH(), pg.Merging(), -2.0f);
6641 __ Fcpy(z3.VnH(), pg.Merging(), 3.0);
6642 __ Fcpy(z4.VnS(), pg.Merging(), Float16(-4.0));
6643 __ Fcpy(z5.VnS(), pg.Merging(), 5.0f);
6644 __ Fcpy(z6.VnS(), pg.Merging(), 6.0);
6645 __ Fcpy(z7.VnD(), pg.Merging(), Float16(7.0));
6646 __ Fcpy(z8.VnD(), pg.Merging(), 8.0f);
Martyn Capewell7db82102020-06-02 16:40:09 +01006647 __ Fmov(z9.VnD(), pg.Merging(), -9.0);
Jacob Bramley0f62eab2019-10-23 17:07:47 +01006648
6649 // Unencodable immediates.
6650 __ Fcpy(z10.VnS(), pg.Merging(), 0.0);
6651 __ Fcpy(z11.VnH(), pg.Merging(), Float16(42.0));
6652 __ Fcpy(z12.VnD(), pg.Merging(), RawbitsToDouble(0x7ff0000012340000)); // NaN
6653 __ Fcpy(z13.VnH(), pg.Merging(), kFP64NegativeInfinity);
6654
Martyn Capewell7db82102020-06-02 16:40:09 +01006655 // Fmov alias.
6656 __ Fmov(z14.VnS(), pg.Merging(), 0.0);
6657 __ Fmov(z15.VnH(), pg.Merging(), Float16(42.0));
6658 __ Fmov(z16.VnD(), pg.Merging(), RawbitsToDouble(0x7ff0000012340000)); // NaN
6659 __ Fmov(z17.VnH(), pg.Merging(), kFP64NegativeInfinity);
Jacob Bramley0f62eab2019-10-23 17:07:47 +01006660 END();
6661
6662 if (CAN_RUN()) {
6663 RUN();
6664 // clang-format off
6665
6666 // 1.0 as FP16: 0x3c00
6667 uint64_t expected_z1[] =
6668 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6669 {0xe9eaebec3c00eff0, 0xf1f23c00f5f63c00, 0x3c00fbfcfdfe3c00};
6670 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
6671
6672 // -2.0 as FP16: 0xc000
6673 uint64_t expected_z2[] =
6674 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6675 {0xe9eaebecc000eff0, 0xf1f2c000f5f6c000, 0xc000fbfcfdfec000};
6676 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
6677
6678 // 3.0 as FP16: 0x4200
6679 uint64_t expected_z3[] =
6680 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6681 {0xe9eaebec4200eff0, 0xf1f24200f5f64200, 0x4200fbfcfdfe4200};
6682 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
6683
6684 // -4.0 as FP32: 0xc0800000
6685 uint64_t expected_z4[] =
6686 // pg: 0 0 1 1 0 1
6687 {0xe9eaebecedeeeff0, 0xc0800000c0800000, 0xf9fafbfcc0800000};
6688 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
6689
6690 // 5.0 as FP32: 0x40a00000
6691 uint64_t expected_z5[] =
6692 // pg: 0 0 1 1 0 1
6693 {0xe9eaebecedeeeff0, 0x40a0000040a00000, 0xf9fafbfc40a00000};
6694 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
6695
6696 // 6.0 as FP32: 0x40c00000
6697 uint64_t expected_z6[] =
6698 // pg: 0 0 1 1 0 1
6699 {0xe9eaebecedeeeff0, 0x40c0000040c00000, 0xf9fafbfc40c00000};
6700 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
6701
6702 // 7.0 as FP64: 0x401c000000000000
6703 uint64_t expected_z7[] =
6704 // pg: 0 1 1
6705 {0xe9eaebecedeeeff0, 0x401c000000000000, 0x401c000000000000};
6706 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
6707
6708 // 8.0 as FP64: 0x4020000000000000
6709 uint64_t expected_z8[] =
6710 // pg: 0 1 1
6711 {0xe9eaebecedeeeff0, 0x4020000000000000, 0x4020000000000000};
6712 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
6713
6714 // -9.0 as FP64: 0xc022000000000000
6715 uint64_t expected_z9[] =
6716 // pg: 0 1 1
6717 {0xe9eaebecedeeeff0, 0xc022000000000000, 0xc022000000000000};
6718 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
6719
6720 // 0.0 as FP32: 0x00000000
6721 uint64_t expected_z10[] =
6722 // pg: 0 0 1 1 0 1
6723 {0xe9eaebecedeeeff0, 0x0000000000000000, 0xf9fafbfc00000000};
6724 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
6725
6726 // 42.0 as FP16: 0x5140
6727 uint64_t expected_z11[] =
6728 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6729 {0xe9eaebec5140eff0, 0xf1f25140f5f65140, 0x5140fbfcfdfe5140};
6730 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
6731
6732 // Signalling NaN (with payload): 0x7ff0000012340000
6733 uint64_t expected_z12[] =
6734 // pg: 0 1 1
6735 {0xe9eaebecedeeeff0, 0x7ff0000012340000, 0x7ff0000012340000};
6736 ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
6737
6738 // -infinity as FP16: 0xfc00
6739 uint64_t expected_z13[] =
6740 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6741 {0xe9eaebecfc00eff0, 0xf1f2fc00f5f6fc00, 0xfc00fbfcfdfefc00};
6742 ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
6743
Martyn Capewell7db82102020-06-02 16:40:09 +01006744 ASSERT_EQUAL_SVE(z10.VnD(), z14.VnD());
6745 ASSERT_EQUAL_SVE(z11.VnD(), z15.VnD());
6746 ASSERT_EQUAL_SVE(z12.VnD(), z16.VnD());
6747 ASSERT_EQUAL_SVE(z13.VnD(), z17.VnD());
Jacob Bramley0f62eab2019-10-23 17:07:47 +01006748 // clang-format on
6749 }
6750}
6751
TatWai Chong4f28df72019-08-14 17:50:30 -07006752TEST_SVE(sve_permute_vector_unpredicated_table_lookup) {
6753 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6754 START();
6755
6756 uint64_t table_inputs[] = {0xffeeddccbbaa9988, 0x7766554433221100};
6757
6758 int index_b[] = {255, 255, 11, 10, 15, 14, 13, 12, 1, 0, 4, 3, 7, 6, 5, 4};
6759
6760 int index_h[] = {5, 6, 7, 8, 2, 3, 6, 4};
6761
6762 int index_s[] = {1, 3, 2, 31, -1};
6763
6764 int index_d[] = {31, 1};
6765
6766 // Initialise the register with a value that doesn't exist in the table.
6767 __ Dup(z9.VnB(), 0x1f);
6768 InsrHelper(&masm, z9.VnD(), table_inputs);
6769
6770 ZRegister ind_b = z0.WithLaneSize(kBRegSize);
6771 ZRegister ind_h = z1.WithLaneSize(kHRegSize);
6772 ZRegister ind_s = z2.WithLaneSize(kSRegSize);
6773 ZRegister ind_d = z3.WithLaneSize(kDRegSize);
6774
6775 InsrHelper(&masm, ind_b, index_b);
6776 InsrHelper(&masm, ind_h, index_h);
6777 InsrHelper(&masm, ind_s, index_s);
6778 InsrHelper(&masm, ind_d, index_d);
6779
6780 __ Tbl(z26.VnB(), z9.VnB(), ind_b);
6781
6782 __ Tbl(z27.VnH(), z9.VnH(), ind_h);
6783
6784 __ Tbl(z28.VnS(), z9.VnS(), ind_s);
6785
6786 __ Tbl(z29.VnD(), z9.VnD(), ind_d);
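// An index beyond the last lane of the table selects zero, hence the
// VL-dependent expected values below.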
6787
6788 END();
6789
6790 if (CAN_RUN()) {
6791 RUN();
6792
6793 // clang-format off
6794 unsigned z26_expected[] = {0x1f, 0x1f, 0xbb, 0xaa, 0xff, 0xee, 0xdd, 0xcc,
6795 0x11, 0x00, 0x44, 0x33, 0x77, 0x66, 0x55, 0x44};
6796
6797 unsigned z27_expected[] = {0xbbaa, 0xddcc, 0xffee, 0x1f1f,
6798 0x5544, 0x7766, 0xddcc, 0x9988};
6799
6800 unsigned z28_expected[] =
6801 {0x77665544, 0xffeeddcc, 0xbbaa9988, 0x1f1f1f1f, 0x1f1f1f1f};
6802
6803 uint64_t z29_expected[] = {0x1f1f1f1f1f1f1f1f, 0xffeeddccbbaa9988};
6804 // clang-format on
6805
6806 unsigned vl = config->sve_vl_in_bits();
6807 for (size_t i = 0; i < ArrayLength(index_b); i++) {
6808 int lane = static_cast<int>(ArrayLength(index_b) - i - 1);
6809 if (!core.HasSVELane(z26.VnB(), lane)) break;
6810 uint64_t expected = (vl > (index_b[i] * kBRegSize)) ? z26_expected[i] : 0;
6811 ASSERT_EQUAL_SVE_LANE(expected, z26.VnB(), lane);
6812 }
6813
6814 for (size_t i = 0; i < ArrayLength(index_h); i++) {
6815 int lane = static_cast<int>(ArrayLength(index_h) - i - 1);
6816 if (!core.HasSVELane(z27.VnH(), lane)) break;
6817 uint64_t expected = (vl > (index_h[i] * kHRegSize)) ? z27_expected[i] : 0;
6818 ASSERT_EQUAL_SVE_LANE(expected, z27.VnH(), lane);
6819 }
6820
6821 for (size_t i = 0; i < ArrayLength(index_s); i++) {
6822 int lane = static_cast<int>(ArrayLength(index_s) - i - 1);
6823 if (!core.HasSVELane(z28.VnS(), lane)) break;
6824 uint64_t expected = (vl > (index_s[i] * kSRegSize)) ? z28_expected[i] : 0;
6825 ASSERT_EQUAL_SVE_LANE(expected, z28.VnS(), lane);
6826 }
6827
6828 for (size_t i = 0; i < ArrayLength(index_d); i++) {
6829 int lane = static_cast<int>(ArrayLength(index_d) - i - 1);
6830 if (!core.HasSVELane(z29.VnD(), lane)) break;
6831 uint64_t expected = (vl > (index_d[i] * kDRegSize)) ? z29_expected[i] : 0;
6832 ASSERT_EQUAL_SVE_LANE(expected, z29.VnD(), lane);
6833 }
6834 }
6835}
6836
Jacob Bramley199339d2019-08-05 18:49:13 +01006837TEST_SVE(ldr_str_z_bi) {
6838 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6839 START();
6840
6841 int vl = config->sve_vl_in_bytes();
6842
6843 // The immediate can address [-256, 255] times the VL, so allocate enough
6844 // space to exceed that in both directions.
6845 int data_size = vl * 1024;
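// With the base placed at the mid-point below, offsets of up to +/-512 VLs
// stay in bounds, which covers the +/-314 VL accesses used in this test.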
6846
6847 uint8_t* data = new uint8_t[data_size];
6848 memset(data, 0, data_size);
6849
6850 // Set the base half-way through the buffer so we can use negative indices.
6851 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
6852
6853 __ Index(z1.VnB(), 1, 3);
6854 __ Index(z2.VnB(), 2, 5);
6855 __ Index(z3.VnB(), 3, 7);
6856 __ Index(z4.VnB(), 4, 11);
6857 __ Index(z5.VnB(), 5, 13);
6858 __ Index(z6.VnB(), 6, 2);
6859 __ Index(z7.VnB(), 7, 3);
6860 __ Index(z8.VnB(), 8, 5);
6861 __ Index(z9.VnB(), 9, 7);
6862
6863 // Encodable cases.
6864 __ Str(z1, SVEMemOperand(x0));
6865 __ Str(z2, SVEMemOperand(x0, 2, SVE_MUL_VL));
6866 __ Str(z3, SVEMemOperand(x0, -3, SVE_MUL_VL));
6867 __ Str(z4, SVEMemOperand(x0, 255, SVE_MUL_VL));
6868 __ Str(z5, SVEMemOperand(x0, -256, SVE_MUL_VL));
6869
Jacob Bramley6ebbba62019-10-09 15:02:10 +01006870 // Cases that fall back on `CalculateSVEAddress`.
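// (A plain byte offset cannot be encoded because the VL is not known at
// assembly time, and 314 is outside the [-256, 255] immediate range.)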
Jacob Bramley199339d2019-08-05 18:49:13 +01006871 __ Str(z6, SVEMemOperand(x0, 6 * vl));
6872 __ Str(z7, SVEMemOperand(x0, -7 * vl));
6873 __ Str(z8, SVEMemOperand(x0, 314, SVE_MUL_VL));
6874 __ Str(z9, SVEMemOperand(x0, -314, SVE_MUL_VL));
6875
6876 // Corresponding loads.
6877 __ Ldr(z11, SVEMemOperand(x0, xzr)); // Test xzr operand.
6878 __ Ldr(z12, SVEMemOperand(x0, 2, SVE_MUL_VL));
6879 __ Ldr(z13, SVEMemOperand(x0, -3, SVE_MUL_VL));
6880 __ Ldr(z14, SVEMemOperand(x0, 255, SVE_MUL_VL));
6881 __ Ldr(z15, SVEMemOperand(x0, -256, SVE_MUL_VL));
6882
6883 __ Ldr(z16, SVEMemOperand(x0, 6 * vl));
6884 __ Ldr(z17, SVEMemOperand(x0, -7 * vl));
6885 __ Ldr(z18, SVEMemOperand(x0, 314, SVE_MUL_VL));
6886 __ Ldr(z19, SVEMemOperand(x0, -314, SVE_MUL_VL));
6887
6888 END();
6889
6890 if (CAN_RUN()) {
6891 RUN();
6892
6893 uint8_t* expected = new uint8_t[data_size];
6894 memset(expected, 0, data_size);
6895 uint8_t* middle = &expected[data_size / 2];
6896
6897 for (int i = 0; i < vl; i++) {
6898 middle[i] = (1 + (3 * i)) & 0xff; // z1
6899 middle[(2 * vl) + i] = (2 + (5 * i)) & 0xff; // z2
6900 middle[(-3 * vl) + i] = (3 + (7 * i)) & 0xff; // z3
6901 middle[(255 * vl) + i] = (4 + (11 * i)) & 0xff; // z4
6902 middle[(-256 * vl) + i] = (5 + (13 * i)) & 0xff; // z5
6903 middle[(6 * vl) + i] = (6 + (2 * i)) & 0xff; // z6
6904 middle[(-7 * vl) + i] = (7 + (3 * i)) & 0xff; // z7
6905 middle[(314 * vl) + i] = (8 + (5 * i)) & 0xff; // z8
6906 middle[(-314 * vl) + i] = (9 + (7 * i)) & 0xff; // z9
6907 }
6908
Jacob Bramley33c99f92019-10-08 15:24:12 +01006909 ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
Jacob Bramley199339d2019-08-05 18:49:13 +01006910
6911 ASSERT_EQUAL_SVE(z1, z11);
6912 ASSERT_EQUAL_SVE(z2, z12);
6913 ASSERT_EQUAL_SVE(z3, z13);
6914 ASSERT_EQUAL_SVE(z4, z14);
6915 ASSERT_EQUAL_SVE(z5, z15);
6916 ASSERT_EQUAL_SVE(z6, z16);
6917 ASSERT_EQUAL_SVE(z7, z17);
6918 ASSERT_EQUAL_SVE(z8, z18);
6919 ASSERT_EQUAL_SVE(z9, z19);
6920
6921 delete[] expected;
6922 }
6923 delete[] data;
6924}
6925
6926TEST_SVE(ldr_str_p_bi) {
6927 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6928 START();
6929
6930 int vl = config->sve_vl_in_bytes();
6931 VIXL_ASSERT((vl % kZRegBitsPerPRegBit) == 0);
6932 int pl = vl / kZRegBitsPerPRegBit;
6933
6934 // The immediate can address [-256, 255] times the PL, so allocate enough
6935 // space to exceed that in both directions.
6936 int data_size = pl * 1024;
6937
6938 uint8_t* data = new uint8_t[data_size];
6939 memset(data, 0, data_size);
6940
6941 // Set the base half-way through the buffer so we can use negative indices.
6942 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
6943
6944 uint64_t pattern[4] = {0x1010101011101111,
6945 0x0010111011000101,
6946 0x1001101110010110,
6947 0x1010110101100011};
6948 for (int i = 8; i <= 15; i++) {
6949 // Initialise p8-p15 with a conveniently-recognisable, non-zero pattern.
6950 Initialise(&masm,
6951 PRegister(i),
6952 pattern[3] * i,
6953 pattern[2] * i,
6954 pattern[1] * i,
6955 pattern[0] * i);
6956 }
6957
6958 // Encodable cases.
6959 __ Str(p8, SVEMemOperand(x0));
6960 __ Str(p9, SVEMemOperand(x0, 2, SVE_MUL_VL));
6961 __ Str(p10, SVEMemOperand(x0, -3, SVE_MUL_VL));
6962 __ Str(p11, SVEMemOperand(x0, 255, SVE_MUL_VL));
6963
Jacob Bramley6ebbba62019-10-09 15:02:10 +01006964 // Cases that fall back on `CalculateSVEAddress`.
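// (As above: plain byte offsets and multipliers outside [-256, 255] are not
// encodable.)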
Jacob Bramley199339d2019-08-05 18:49:13 +01006965 __ Str(p12, SVEMemOperand(x0, 6 * pl));
6966 __ Str(p13, SVEMemOperand(x0, -7 * pl));
6967 __ Str(p14, SVEMemOperand(x0, 314, SVE_MUL_VL));
6968 __ Str(p15, SVEMemOperand(x0, -314, SVE_MUL_VL));
6969
6970 // Corresponding loads.
6971 __ Ldr(p0, SVEMemOperand(x0));
6972 __ Ldr(p1, SVEMemOperand(x0, 2, SVE_MUL_VL));
6973 __ Ldr(p2, SVEMemOperand(x0, -3, SVE_MUL_VL));
6974 __ Ldr(p3, SVEMemOperand(x0, 255, SVE_MUL_VL));
6975
6976 __ Ldr(p4, SVEMemOperand(x0, 6 * pl));
6977 __ Ldr(p5, SVEMemOperand(x0, -7 * pl));
6978 __ Ldr(p6, SVEMemOperand(x0, 314, SVE_MUL_VL));
6979 __ Ldr(p7, SVEMemOperand(x0, -314, SVE_MUL_VL));
6980
6981 END();
6982
6983 if (CAN_RUN()) {
6984 RUN();
6985
6986 uint8_t* expected = new uint8_t[data_size];
6987 memset(expected, 0, data_size);
6988 uint8_t* middle = &expected[data_size / 2];
6989
6990 for (int i = 0; i < pl; i++) {
6991 int bit_index = (i % sizeof(pattern[0])) * kBitsPerByte;
6992 size_t index = i / sizeof(pattern[0]);
6993 VIXL_ASSERT(index < ArrayLength(pattern));
6994 uint64_t byte = (pattern[index] >> bit_index) & 0xff;
6995 // Each byte of `pattern` can be multiplied by 15 without carry.
6996 VIXL_ASSERT((byte * 15) <= 0xff);
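// (Every byte of `pattern` is 0x00, 0x01, 0x10 or 0x11, and 0x11 * 15 == 0xff.)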
6997
6998 middle[i] = byte * 8; // p8
6999 middle[(2 * pl) + i] = byte * 9; // p9
7000 middle[(-3 * pl) + i] = byte * 10; // p10
7001 middle[(255 * pl) + i] = byte * 11; // p11
7002 middle[(6 * pl) + i] = byte * 12; // p12
7003 middle[(-7 * pl) + i] = byte * 13; // p13
7004 middle[(314 * pl) + i] = byte * 14; // p14
7005 middle[(-314 * pl) + i] = byte * 15; // p15
7006 }
7007
Jacob Bramley33c99f92019-10-08 15:24:12 +01007008 ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
Jacob Bramley199339d2019-08-05 18:49:13 +01007009
7010 ASSERT_EQUAL_SVE(p0, p8);
7011 ASSERT_EQUAL_SVE(p1, p9);
7012 ASSERT_EQUAL_SVE(p2, p10);
7013 ASSERT_EQUAL_SVE(p3, p11);
7014 ASSERT_EQUAL_SVE(p4, p12);
7015 ASSERT_EQUAL_SVE(p5, p13);
7016 ASSERT_EQUAL_SVE(p6, p14);
7017 ASSERT_EQUAL_SVE(p7, p15);
7018
7019 delete[] expected;
7020 }
7021 delete[] data;
7022}
7023
Jacob Bramleye668b202019-08-14 17:57:34 +01007024template <typename T>
7025static void MemoryWrite(uint8_t* base, int64_t offset, int64_t index, T data) {
7026 memcpy(base + offset + (index * sizeof(data)), &data, sizeof(data));
7027}
7028
7029TEST_SVE(sve_ld1_st1_contiguous) {
7030 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
7031 START();
7032
7033 int vl = config->sve_vl_in_bytes();
7034
7035 // The immediate can address [-8, 7] times the VL, so allocate enough space to
7036 // exceed that in both directions.
7037 int data_size = vl * 128;
7038
7039 uint8_t* data = new uint8_t[data_size];
7040 memset(data, 0, data_size);
7041
Martyn Capewell452ad8b2020-03-19 15:49:57 +00007042 // Set the base half-way through the buffer so we can use negative indices.
Jacob Bramleye668b202019-08-14 17:57:34 +01007043 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
7044
Jacob Bramleye668b202019-08-14 17:57:34 +01007045 // Encodable scalar-plus-immediate cases.
7046 __ Index(z1.VnB(), 1, -3);
7047 __ Ptrue(p1.VnB());
7048 __ St1b(z1.VnB(), p1, SVEMemOperand(x0));
7049
7050 __ Index(z2.VnH(), -2, 5);
7051 __ Ptrue(p2.VnH(), SVE_MUL3);
7052 __ St1b(z2.VnH(), p2, SVEMemOperand(x0, 7, SVE_MUL_VL));
7053
7054 __ Index(z3.VnS(), 3, -7);
7055 __ Ptrue(p3.VnS(), SVE_POW2);
7056 __ St1h(z3.VnS(), p3, SVEMemOperand(x0, -8, SVE_MUL_VL));
7057
7058 // Encodable scalar-plus-scalar cases.
7059 __ Index(z4.VnD(), -4, 11);
7060 __ Ptrue(p4.VnD(), SVE_VL3);
7061 __ Addvl(x1, x0, 8); // Try not to overlap with VL-dependent cases.
7062 __ Mov(x2, 17);
7063 __ St1b(z4.VnD(), p4, SVEMemOperand(x1, x2));
7064
7065 __ Index(z5.VnD(), 6, -2);
7066 __ Ptrue(p5.VnD(), SVE_VL16);
TatWai Chong6205eb42019-09-24 10:07:20 +01007067 __ Addvl(x3, x0, 10); // Try not to overlap with VL-dependent cases.
7068 __ Mov(x4, 6);
7069 __ St1d(z5.VnD(), p5, SVEMemOperand(x3, x4, LSL, 3));
Jacob Bramleye668b202019-08-14 17:57:34 +01007070
Jacob Bramley6ebbba62019-10-09 15:02:10 +01007071 // Unencodable cases fall back on `CalculateSVEAddress`.
Jacob Bramleye668b202019-08-14 17:57:34 +01007072 __ Index(z6.VnS(), -7, 3);
7073 // Setting SVE_ALL on B lanes checks that the Simulator ignores irrelevant
7074 // predicate bits when handling larger lanes.
7075 __ Ptrue(p6.VnB(), SVE_ALL);
7076 __ St1w(z6.VnS(), p6, SVEMemOperand(x0, 42, SVE_MUL_VL));
7077
TatWai Chong6205eb42019-09-24 10:07:20 +01007078 __ Index(z7.VnD(), 32, -11);
7079 __ Ptrue(p7.VnD(), SVE_MUL4);
7080 __ St1w(z7.VnD(), p7, SVEMemOperand(x0, 22, SVE_MUL_VL));
Jacob Bramleye668b202019-08-14 17:57:34 +01007081
TatWai Chong6205eb42019-09-24 10:07:20 +01007082 // Corresponding loads.
7083 __ Ld1b(z8.VnB(), p1.Zeroing(), SVEMemOperand(x0));
7084 __ Ld1b(z9.VnH(), p2.Zeroing(), SVEMemOperand(x0, 7, SVE_MUL_VL));
7085 __ Ld1h(z10.VnS(), p3.Zeroing(), SVEMemOperand(x0, -8, SVE_MUL_VL));
7086 __ Ld1b(z11.VnD(), p4.Zeroing(), SVEMemOperand(x1, x2));
7087 __ Ld1d(z12.VnD(), p5.Zeroing(), SVEMemOperand(x3, x4, LSL, 3));
7088 __ Ld1w(z13.VnS(), p6.Zeroing(), SVEMemOperand(x0, 42, SVE_MUL_VL));
7089
7090 __ Ld1sb(z14.VnH(), p2.Zeroing(), SVEMemOperand(x0, 7, SVE_MUL_VL));
7091 __ Ld1sh(z15.VnS(), p3.Zeroing(), SVEMemOperand(x0, -8, SVE_MUL_VL));
7092 __ Ld1sb(z16.VnD(), p4.Zeroing(), SVEMemOperand(x1, x2));
7093 __ Ld1sw(z17.VnD(), p7.Zeroing(), SVEMemOperand(x0, 22, SVE_MUL_VL));
7094
7095 // We can test ld1 by comparing the value loaded with the value stored. In
7096 // most cases, there are two complications:
7097 // - Loads have zeroing predication, so we have to clear the inactive
7098 // elements on our reference.
7099 // - We have to replicate any sign- or zero-extension.
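// For example, the Ld1b into H lanes (z9) is checked against Uxtb of the
// stored values (z2), applied with merging predication to a zeroed register.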
7100
7101 // Ld1b(z8.VnB(), ...)
7102 __ Dup(z18.VnB(), 0);
7103 __ Mov(z18.VnB(), p1.Merging(), z1.VnB());
7104
7105 // Ld1b(z9.VnH(), ...)
7106 __ Dup(z19.VnH(), 0);
7107 __ Uxtb(z19.VnH(), p2.Merging(), z2.VnH());
7108
7109 // Ld1h(z10.VnS(), ...)
7110 __ Dup(z20.VnS(), 0);
7111 __ Uxth(z20.VnS(), p3.Merging(), z3.VnS());
7112
7113 // Ld1b(z11.VnD(), ...)
7114 __ Dup(z21.VnD(), 0);
7115 __ Uxtb(z21.VnD(), p4.Merging(), z4.VnD());
7116
7117 // Ld1d(z12.VnD(), ...)
7118 __ Dup(z22.VnD(), 0);
7119 __ Mov(z22.VnD(), p5.Merging(), z5.VnD());
7120
7121 // Ld1w(z13.VnS(), ...)
7122 __ Dup(z23.VnS(), 0);
7123 __ Mov(z23.VnS(), p6.Merging(), z6.VnS());
7124
7125 // Ld1sb(z14.VnH(), ...)
7126 __ Dup(z24.VnH(), 0);
7127 __ Sxtb(z24.VnH(), p2.Merging(), z2.VnH());
7128
7129 // Ld1sh(z15.VnS(), ...)
7130 __ Dup(z25.VnS(), 0);
7131 __ Sxth(z25.VnS(), p3.Merging(), z3.VnS());
7132
7133 // Ld1sb(z16.VnD(), ...)
7134 __ Dup(z26.VnD(), 0);
7135 __ Sxtb(z26.VnD(), p4.Merging(), z4.VnD());
7136
7137 // Ld1sw(z17.VnD(), ...)
7138 __ Dup(z27.VnD(), 0);
7139 __ Sxtw(z27.VnD(), p7.Merging(), z7.VnD());
Jacob Bramleye668b202019-08-14 17:57:34 +01007140
7141 END();
7142
7143 if (CAN_RUN()) {
7144 RUN();
7145
7146 uint8_t* expected = new uint8_t[data_size];
7147 memset(expected, 0, data_size);
7148 uint8_t* middle = &expected[data_size / 2];
7149
7150 int vl_b = vl / kBRegSizeInBytes;
7151 int vl_h = vl / kHRegSizeInBytes;
7152 int vl_s = vl / kSRegSizeInBytes;
7153 int vl_d = vl / kDRegSizeInBytes;
7154
7155 // Encodable cases.
7156
7157 // st1b { z1.b }, SVE_ALL
7158 for (int i = 0; i < vl_b; i++) {
7159 MemoryWrite(middle, 0, i, static_cast<uint8_t>(1 - (3 * i)));
7160 }
7161
7162 // st1b { z2.h }, SVE_MUL3
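// One SVE_MUL_VL step for st1b with H lanes is the number of H lanes in a
// vector, i.e. vl / 2 bytes, since each active lane stores a single byte.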
7163 int vl_h_mul3 = vl_h - (vl_h % 3);
7164 for (int i = 0; i < vl_h_mul3; i++) {
Jacob Bramley6ebbba62019-10-09 15:02:10 +01007165 int64_t offset = 7 * static_cast<int>(vl / (kHRegSize / kBRegSize));
7166 MemoryWrite(middle, offset, i, static_cast<uint8_t>(-2 + (5 * i)));
Jacob Bramleye668b202019-08-14 17:57:34 +01007167 }
7168
7169 // st1h { z3.s }, SVE_POW2
7170 int vl_s_pow2 = 1 << HighestSetBitPosition(vl_s);
7171 for (int i = 0; i < vl_s_pow2; i++) {
Jacob Bramley6ebbba62019-10-09 15:02:10 +01007172 int64_t offset = -8 * static_cast<int>(vl / (kSRegSize / kHRegSize));
7173 MemoryWrite(middle, offset, i, static_cast<uint16_t>(3 - (7 * i)));
Jacob Bramleye668b202019-08-14 17:57:34 +01007174 }
7175
7176 // st1b { z4.d }, SVE_VL3
7177 if (vl_d >= 3) {
7178 for (int i = 0; i < 3; i++) {
7179 MemoryWrite(middle,
7180 (8 * vl) + 17,
7181 i,
7182 static_cast<uint8_t>(-4 + (11 * i)));
7183 }
7184 }
7185
7186 // st1d { z5.d }, SVE_VL16
7187 if (vl_d >= 16) {
7188 for (int i = 0; i < 16; i++) {
7189 MemoryWrite(middle,
7190 (10 * vl) + (6 * kDRegSizeInBytes),
7191 i,
7192 static_cast<uint64_t>(6 - (2 * i)));
7193 }
7194 }
7195
7196 // Unencodable cases.
7197
7198 // st1w { z6.s }, SVE_ALL
7199 for (int i = 0; i < vl_s; i++) {
7200 MemoryWrite(middle, 42 * vl, i, static_cast<uint32_t>(-7 + (3 * i)));
7201 }
7202
TatWai Chong6205eb42019-09-24 10:07:20 +01007203 // st1w { z7.d }, SVE_MUL4
7204 int vl_d_mul4 = vl_d - (vl_d % 4);
7205 for (int i = 0; i < vl_d_mul4; i++) {
Jacob Bramley6ebbba62019-10-09 15:02:10 +01007206 int64_t offset = 22 * static_cast<int>(vl / (kDRegSize / kWRegSize));
7207 MemoryWrite(middle, offset, i, static_cast<uint32_t>(32 + (-11 * i)));
TatWai Chong6205eb42019-09-24 10:07:20 +01007208 }
7209
Jacob Bramley33c99f92019-10-08 15:24:12 +01007210 ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
Jacob Bramleye668b202019-08-14 17:57:34 +01007211
TatWai Chong6205eb42019-09-24 10:07:20 +01007212 // Check that we loaded back the expected values.
7213
7214 ASSERT_EQUAL_SVE(z18, z8);
7215 ASSERT_EQUAL_SVE(z19, z9);
7216 ASSERT_EQUAL_SVE(z20, z10);
7217 ASSERT_EQUAL_SVE(z21, z11);
7218 ASSERT_EQUAL_SVE(z22, z12);
7219 ASSERT_EQUAL_SVE(z23, z13);
7220 ASSERT_EQUAL_SVE(z24, z14);
7221 ASSERT_EQUAL_SVE(z25, z15);
7222 ASSERT_EQUAL_SVE(z26, z16);
7223 ASSERT_EQUAL_SVE(z27, z17);
7224
Jacob Bramleye668b202019-08-14 17:57:34 +01007225 delete[] expected;
7226 }
7227 delete[] data;
7228}
7229
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007230TEST_SVE(sve_ld2_st2_scalar_plus_imm) {
7231 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
7232 START();
7233
7234 int vl = config->sve_vl_in_bytes();
7235
7236 // The immediate can address [-16, 14] times the VL, so allocate enough space
7237 // to exceed that in both directions.
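// The offset must also be a multiple of two, since the immediate is scaled by
// the number of registers in the group.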
7238 int data_size = vl * 128;
7239
7240 uint8_t* data = new uint8_t[data_size];
7241 memset(data, 0, data_size);
7242
7243 // Set the base half-way through the buffer so we can use negative indices.
7244 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
7245
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007246 __ Index(z14.VnB(), 1, -3);
7247 __ Index(z15.VnB(), 2, -3);
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007248 __ Ptrue(p0.VnB());
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007249 __ St2b(z14.VnB(), z15.VnB(), p0, SVEMemOperand(x0));
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007250
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007251 __ Index(z16.VnH(), -2, 5);
7252 __ Index(z17.VnH(), -3, 5);
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007253 __ Ptrue(p1.VnH(), SVE_MUL3);
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007254 __ St2h(z16.VnH(), z17.VnH(), p1, SVEMemOperand(x0, 8, SVE_MUL_VL));
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007255
7256 // Wrap around from z31 to z0.
7257 __ Index(z31.VnS(), 3, -7);
7258 __ Index(z0.VnS(), 4, -7);
7259 __ Ptrue(p2.VnS(), SVE_POW2);
7260 __ St2w(z31.VnS(), z0.VnS(), p2, SVEMemOperand(x0, -12, SVE_MUL_VL));
7261
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007262 __ Index(z18.VnD(), -7, 3);
7263 __ Index(z19.VnD(), -8, 3);
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007264 // Sparse predication, including some irrelevant bits (0xe). To make the
7265 // results easy to check, activate each lane <n> where n is a multiple of 5.
7266 Initialise(&masm,
7267 p3,
7268 0xeee10000000001ee,
7269 0xeeeeeee100000000,
7270 0x01eeeeeeeee10000,
7271 0x000001eeeeeeeee1);
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007272 __ St2d(z18.VnD(), z19.VnD(), p3, SVEMemOperand(x0, 14, SVE_MUL_VL));
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007273
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007274 // We can test ld2 by comparing the values loaded with the values stored.
7275 // There are two complications:
7276 // - Loads have zeroing predication, so we have to clear the inactive
7277 // elements on our reference.
7278 // - We want to test both loads and stores that span { z31, z0 }, so we have
7279 // to move some values around.
7280 //
7281 // Registers z4-z11 will hold as-stored values (with inactive elements
7282 // cleared). Registers z20-z27 will hold the values that were loaded.
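// For example, the pair stored from { z31.s, z0.s } is reloaded into
// { z24.s, z25.s } and compared with { z8.s, z9.s }, the predicated copies of
// the stored values.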
7283
7284 // Ld2b(z14.VnB(), z15.VnB(), ...)
7285 __ Dup(z4.VnB(), 0);
7286 __ Dup(z5.VnB(), 0);
7287 __ Mov(z4.VnB(), p0.Merging(), z14.VnB());
7288 __ Mov(z5.VnB(), p0.Merging(), z15.VnB());
7289
7290 // Ld2h(z16.VnH(), z17.VnH(), ...)
7291 __ Dup(z6.VnH(), 0);
7292 __ Dup(z7.VnH(), 0);
7293 __ Mov(z6.VnH(), p1.Merging(), z16.VnH());
7294 __ Mov(z7.VnH(), p1.Merging(), z17.VnH());
7295
7296 // Ld2w(z31.VnS(), z0.VnS(), ...)
7297 __ Dup(z8.VnS(), 0);
7298 __ Dup(z9.VnS(), 0);
7299 __ Mov(z8.VnS(), p2.Merging(), z31.VnS());
7300 __ Mov(z9.VnS(), p2.Merging(), z0.VnS());
7301
7302 // Ld2d(z18.VnD(), z19.VnD(), ...)
7303 __ Dup(z10.VnD(), 0);
7304 __ Dup(z11.VnD(), 0);
7305 __ Mov(z10.VnD(), p3.Merging(), z18.VnD());
7306 __ Mov(z11.VnD(), p3.Merging(), z19.VnD());
7307
7308 // Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
7309 __ Ld2b(z31.VnB(), z0.VnB(), p0.Zeroing(), SVEMemOperand(x0));
7310 __ Mov(z20, z31);
7311 __ Mov(z21, z0);
7312
7313 __ Ld2h(z22.VnH(), z23.VnH(), p1.Zeroing(), SVEMemOperand(x0, 8, SVE_MUL_VL));
7314 __ Ld2w(z24.VnS(),
7315 z25.VnS(),
7316 p2.Zeroing(),
7317 SVEMemOperand(x0, -12, SVE_MUL_VL));
7318 __ Ld2d(z26.VnD(),
7319 z27.VnD(),
7320 p3.Zeroing(),
7321 SVEMemOperand(x0, 14, SVE_MUL_VL));
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007322
7323 END();
7324
7325 if (CAN_RUN()) {
7326 RUN();
7327
7328 uint8_t* expected = new uint8_t[data_size];
7329 memset(expected, 0, data_size);
7330 uint8_t* middle = &expected[data_size / 2];
7331
7332 int vl_b = vl / kBRegSizeInBytes;
7333 int vl_h = vl / kHRegSizeInBytes;
7334 int vl_s = vl / kSRegSizeInBytes;
7335 int vl_d = vl / kDRegSizeInBytes;
7336
7337 int reg_count = 2;
7338
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007339 // st2b { z14.b, z15.b }, SVE_ALL
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007340 for (int i = 0; i < vl_b; i++) {
7341 uint8_t lane0 = 1 - (3 * i);
7342 uint8_t lane1 = 2 - (3 * i);
7343 MemoryWrite(middle, 0, (i * reg_count) + 0, lane0);
7344 MemoryWrite(middle, 0, (i * reg_count) + 1, lane1);
7345 }
7346
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007347 // st2h { z16.h, z17.h }, SVE_MUL3
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007348 int vl_h_mul3 = vl_h - (vl_h % 3);
7349 for (int i = 0; i < vl_h_mul3; i++) {
7350 int64_t offset = 8 * vl;
7351 uint16_t lane0 = -2 + (5 * i);
7352 uint16_t lane1 = -3 + (5 * i);
7353 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7354 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7355 }
7356
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007357 // st2w { z31.s, z0.s }, SVE_POW2
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007358 int vl_s_pow2 = 1 << HighestSetBitPosition(vl_s);
7359 for (int i = 0; i < vl_s_pow2; i++) {
7360 int64_t offset = -12 * vl;
7361 uint32_t lane0 = 3 - (7 * i);
7362 uint32_t lane1 = 4 - (7 * i);
7363 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7364 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7365 }
7366
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007367 // st2d { z18.d, z19.d }, ((i % 5) == 0)
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007368 for (int i = 0; i < vl_d; i++) {
7369 if ((i % 5) == 0) {
7370 int64_t offset = 14 * vl;
7371 uint64_t lane0 = -7 + (3 * i);
7372 uint64_t lane1 = -8 + (3 * i);
7373 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7374 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7375 }
7376 }
7377
7378 ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
7379
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007380 // Check that we loaded back the expected values.
7381
7382 // st2b/ld2b
7383 ASSERT_EQUAL_SVE(z4, z20);
7384 ASSERT_EQUAL_SVE(z5, z21);
7385
7386 // st2h/ld2h
7387 ASSERT_EQUAL_SVE(z6, z22);
7388 ASSERT_EQUAL_SVE(z7, z23);
7389
7390 // st2w/ld2w
7391 ASSERT_EQUAL_SVE(z8, z24);
7392 ASSERT_EQUAL_SVE(z9, z25);
7393
7394 // st2d/ld2d
7395 ASSERT_EQUAL_SVE(z10, z26);
7396 ASSERT_EQUAL_SVE(z11, z27);
7397
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007398 delete[] expected;
7399 }
7400 delete[] data;
7401}
7402
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007403TEST_SVE(sve_ld2_st2_scalar_plus_scalar) {
7404 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
7405 START();
7406
7407 int vl = config->sve_vl_in_bytes();
7408
7409 // Allocate plenty of space to enable indexing in both directions.
7410 int data_size = vl * 128;
7411
7412 uint8_t* data = new uint8_t[data_size];
7413 memset(data, 0, data_size);
7414
7415  // Set the base half-way through the buffer so we can use negative indices.
7416 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
7417
Jacob Bramleye483ce52019-11-05 16:52:29 +00007418 __ Index(z10.VnB(), -4, 11);
7419 __ Index(z11.VnB(), -5, 11);
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007420 __ Ptrue(p7.VnB(), SVE_MUL4);
7421 __ Mov(x1, 0);
Jacob Bramleye483ce52019-11-05 16:52:29 +00007422 __ St2b(z10.VnB(), z11.VnB(), p7, SVEMemOperand(x0, x1));
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007423
Jacob Bramleye483ce52019-11-05 16:52:29 +00007424 __ Index(z12.VnH(), 6, -2);
7425 __ Index(z13.VnH(), 7, -2);
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007426 __ Ptrue(p6.VnH(), SVE_VL16);
7427 __ Rdvl(x2, 3); // Make offsets VL-dependent so we can avoid overlap.
Jacob Bramleye483ce52019-11-05 16:52:29 +00007428 __ St2h(z12.VnH(), z13.VnH(), p6, SVEMemOperand(x0, x2, LSL, 1));
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007429
Jacob Bramleye483ce52019-11-05 16:52:29 +00007430 __ Index(z14.VnS(), -7, 3);
7431 __ Index(z15.VnS(), -8, 3);
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007432 // Sparse predication, including some irrelevant bits (0xe). To make the
7433 // results easy to check, activate each lane <n> where n is a multiple of 5.
7434 Initialise(&masm,
7435 p5,
7436 0xeee1000010000100,
7437 0x001eeee100001000,
7438 0x0100001eeee10000,
7439 0x10000100001eeee1);
Jacob Bramleye483ce52019-11-05 16:52:29 +00007440 __ Rdvl(x3, -3);
7441 __ St2w(z14.VnS(), z15.VnS(), p5, SVEMemOperand(x0, x3, LSL, 2));
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007442
7443 // Wrap around from z31 to z0.
7444 __ Index(z31.VnD(), 32, -11);
7445 __ Index(z0.VnD(), 33, -11);
7446 __ Ptrue(p4.VnD(), SVE_MUL3);
Jacob Bramleye483ce52019-11-05 16:52:29 +00007447 __ Rdvl(x4, 1);
7448 __ St2d(z31.VnD(), z0.VnD(), p4, SVEMemOperand(x0, x4, LSL, 3));
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007449
Jacob Bramleye483ce52019-11-05 16:52:29 +00007450 // We can test ld2 by comparing the values loaded with the values stored.
7451 // There are two complications:
7452 // - Loads have zeroing predication, so we have to clear the inactive
7453 // elements on our reference.
7454 // - We want to test both loads and stores that span { z31, z0 }, so we have
7455 // to move some values around.
7456 //
7457 // Registers z4-z11 will hold as-stored values (with inactive elements
7458 // cleared). Registers z20-z27 will hold the values that were loaded.
7459
7460 // Ld2b(z20.VnB(), z21.VnB(), ...)
7461 __ Dup(z4.VnB(), 0);
7462 __ Dup(z5.VnB(), 0);
7463 __ Mov(z4.VnB(), p7.Merging(), z10.VnB());
7464 __ Mov(z5.VnB(), p7.Merging(), z11.VnB());
7465
7466 // Ld2h(z22.VnH(), z23.VnH(), ...)
7467 __ Dup(z6.VnH(), 0);
7468 __ Dup(z7.VnH(), 0);
7469 __ Mov(z6.VnH(), p6.Merging(), z12.VnH());
7470 __ Mov(z7.VnH(), p6.Merging(), z13.VnH());
7471
7472 // Ld2w(z24.VnS(), z25.VnS(), ...)
7473 __ Dup(z8.VnS(), 0);
7474 __ Dup(z9.VnS(), 0);
7475 __ Mov(z8.VnS(), p5.Merging(), z14.VnS());
7476 __ Mov(z9.VnS(), p5.Merging(), z15.VnS());
7477
7478 // Ld2d(z31.VnD(), z0.VnD(), ...)
7479 __ Dup(z10.VnD(), 0);
7480 __ Dup(z11.VnD(), 0);
7481 __ Mov(z10.VnD(), p4.Merging(), z31.VnD());
7482 __ Mov(z11.VnD(), p4.Merging(), z0.VnD());
7483
7484 // Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
7485 __ Ld2b(z31.VnB(), z0.VnB(), p7.Zeroing(), SVEMemOperand(x0, x1));
7486 __ Mov(z20, z31);
7487 __ Mov(z21, z0);
7488
7489 __ Ld2h(z22.VnH(), z23.VnH(), p6.Zeroing(), SVEMemOperand(x0, x2, LSL, 1));
7490 __ Ld2w(z24.VnS(), z25.VnS(), p5.Zeroing(), SVEMemOperand(x0, x3, LSL, 2));
7491 __ Ld2d(z26.VnD(), z27.VnD(), p4.Zeroing(), SVEMemOperand(x0, x4, LSL, 3));
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007492
7493 END();
7494
7495 if (CAN_RUN()) {
7496 RUN();
7497
7498 uint8_t* expected = new uint8_t[data_size];
7499 memset(expected, 0, data_size);
7500 uint8_t* middle = &expected[data_size / 2];
7501
7502 int vl_b = vl / kBRegSizeInBytes;
7503 int vl_h = vl / kHRegSizeInBytes;
7504 int vl_s = vl / kSRegSizeInBytes;
7505 int vl_d = vl / kDRegSizeInBytes;
7506
7507 int reg_count = 2;
7508
Jacob Bramleye483ce52019-11-05 16:52:29 +00007509 // st2b { z10.b, z11.b }, SVE_MUL4
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007510 int vl_b_mul4 = vl_b - (vl_b % 4);
7511 for (int i = 0; i < vl_b_mul4; i++) {
7512 uint8_t lane0 = -4 + (11 * i);
7513 uint8_t lane1 = -5 + (11 * i);
7514 MemoryWrite(middle, 0, (i * reg_count) + 0, lane0);
7515 MemoryWrite(middle, 0, (i * reg_count) + 1, lane1);
7516 }
7517
Jacob Bramleye483ce52019-11-05 16:52:29 +00007518 // st2h { z12.h, z13.h }, SVE_VL16
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007519 if (vl_h >= 16) {
7520 for (int i = 0; i < 16; i++) {
7521 int64_t offset = (3 << kHRegSizeInBytesLog2) * vl;
7522 uint16_t lane0 = 6 - (2 * i);
7523 uint16_t lane1 = 7 - (2 * i);
7524 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7525 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7526 }
7527 }
7528
Jacob Bramleye483ce52019-11-05 16:52:29 +00007529 // st2w { z14.s, z15.s }, ((i % 5) == 0)
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007530 for (int i = 0; i < vl_s; i++) {
7531 if ((i % 5) == 0) {
7532 int64_t offset = -(3 << kSRegSizeInBytesLog2) * vl;
7533 uint32_t lane0 = -7 + (3 * i);
7534 uint32_t lane1 = -8 + (3 * i);
7535 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7536 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7537 }
7538 }
7539
7540    // st2d { z31.d, z0.d }, SVE_MUL3
7541 int vl_d_mul3 = vl_d - (vl_d % 3);
7542 for (int i = 0; i < vl_d_mul3; i++) {
7543 int64_t offset = (1 << kDRegSizeInBytesLog2) * vl;
7544 uint64_t lane0 = 32 - (11 * i);
7545 uint64_t lane1 = 33 - (11 * i);
7546 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7547 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7548 }
7549
7550 ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
7551
Jacob Bramleye483ce52019-11-05 16:52:29 +00007552 // Check that we loaded back the expected values.
7553
7554 // st2b/ld2b
7555 ASSERT_EQUAL_SVE(z4, z20);
7556 ASSERT_EQUAL_SVE(z5, z21);
7557
7558 // st2h/ld2h
7559 ASSERT_EQUAL_SVE(z6, z22);
7560 ASSERT_EQUAL_SVE(z7, z23);
7561
7562 // st2w/ld2w
7563 ASSERT_EQUAL_SVE(z8, z24);
7564 ASSERT_EQUAL_SVE(z9, z25);
7565
7566 // st2d/ld2d
7567 ASSERT_EQUAL_SVE(z10, z26);
7568 ASSERT_EQUAL_SVE(z11, z27);
7569
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007570 delete[] expected;
7571 }
7572 delete[] data;
7573}
7574
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007575TEST_SVE(sve_ld3_st3_scalar_plus_imm) {
7576 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
7577 START();
7578
7579 int vl = config->sve_vl_in_bytes();
7580
7581 // The immediate can address [-24, 21] times the VL, so allocate enough space
7582 // to exceed that in both directions.
7583 int data_size = vl * 128;
7584
7585 uint8_t* data = new uint8_t[data_size];
7586 memset(data, 0, data_size);
7587
7588  // Set the base half-way through the buffer so we can use negative indices.
7589 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
7590
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007591 // We can test ld3 by comparing the values loaded with the values stored.
7592 // There are two complications:
7593 // - Loads have zeroing predication, so we have to clear the inactive
7594 // elements on our reference.
7595 // - We want to test both loads and stores that span { z31, z0 }, so we have
7596 // to move some values around.
7597 //
7598 // Registers z4-z15 will hold as-stored values (with inactive elements
7599 // cleared). Registers z16-z27 will hold the values that were loaded.
7600
7601 __ Index(z10.VnB(), 1, -3);
7602 __ Index(z11.VnB(), 2, -3);
7603 __ Index(z12.VnB(), 3, -3);
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007604 __ Ptrue(p0.VnB());
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007605 __ St3b(z10.VnB(), z11.VnB(), z12.VnB(), p0, SVEMemOperand(x0));
7606 // Save the stored values for ld3 tests.
7607 __ Dup(z4.VnB(), 0);
7608 __ Dup(z5.VnB(), 0);
7609 __ Dup(z6.VnB(), 0);
7610 __ Mov(z4.VnB(), p0.Merging(), z10.VnB());
7611 __ Mov(z5.VnB(), p0.Merging(), z11.VnB());
7612 __ Mov(z6.VnB(), p0.Merging(), z12.VnB());
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007613
7614 // Wrap around from z31 to z0.
7615 __ Index(z31.VnH(), -2, 5);
7616 __ Index(z0.VnH(), -3, 5);
7617 __ Index(z1.VnH(), -4, 5);
7618 __ Ptrue(p1.VnH(), SVE_MUL3);
7619 __ St3h(z31.VnH(), z0.VnH(), z1.VnH(), p1, SVEMemOperand(x0, 9, SVE_MUL_VL));
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007620 // Save the stored values for ld3 tests.
7621 __ Dup(z7.VnH(), 0);
7622 __ Dup(z8.VnH(), 0);
7623 __ Dup(z9.VnH(), 0);
7624 __ Mov(z7.VnH(), p1.Merging(), z31.VnH());
7625 __ Mov(z8.VnH(), p1.Merging(), z0.VnH());
7626 __ Mov(z9.VnH(), p1.Merging(), z1.VnH());
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007627
7628 __ Index(z30.VnS(), 3, -7);
7629 __ Index(z31.VnS(), 4, -7);
7630 __ Index(z0.VnS(), 5, -7);
7631 __ Ptrue(p2.VnS(), SVE_POW2);
7632 __ St3w(z30.VnS(),
7633 z31.VnS(),
7634 z0.VnS(),
7635 p2,
7636 SVEMemOperand(x0, -12, SVE_MUL_VL));
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007637 // Save the stored values for ld3 tests.
7638 __ Dup(z10.VnS(), 0);
7639 __ Dup(z11.VnS(), 0);
7640 __ Dup(z12.VnS(), 0);
7641 __ Mov(z10.VnS(), p2.Merging(), z30.VnS());
7642 __ Mov(z11.VnS(), p2.Merging(), z31.VnS());
7643 __ Mov(z12.VnS(), p2.Merging(), z0.VnS());
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007644
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007645 __ Index(z0.VnD(), -7, 3);
7646 __ Index(z1.VnD(), -8, 3);
7647 __ Index(z2.VnD(), -9, 3);
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007648 // Sparse predication, including some irrelevant bits (0xee). To make the
7649 // results easy to check, activate each lane <n> where n is a multiple of 5.
7650 Initialise(&masm,
7651 p3,
7652 0xeee10000000001ee,
7653 0xeeeeeee100000000,
7654 0x01eeeeeeeee10000,
7655 0x000001eeeeeeeee1);
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007656 __ St3d(z0.VnD(), z1.VnD(), z2.VnD(), p3, SVEMemOperand(x0, 15, SVE_MUL_VL));
7657 // Save the stored values for ld3 tests.
7658 __ Dup(z13.VnD(), 0);
7659 __ Dup(z14.VnD(), 0);
7660 __ Dup(z15.VnD(), 0);
7661 __ Mov(z13.VnD(), p3.Merging(), z0.VnD());
7662 __ Mov(z14.VnD(), p3.Merging(), z1.VnD());
7663 __ Mov(z15.VnD(), p3.Merging(), z2.VnD());
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007664
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007665 // Corresponding loads.
7666 // Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
7667 __ Ld3b(z31.VnB(), z0.VnB(), z1.VnB(), p0.Zeroing(), SVEMemOperand(x0));
7668 __ Mov(z16, z31);
7669 __ Mov(z17, z0);
7670 __ Mov(z18, z1);
7671 __ Ld3h(z30.VnH(),
7672 z31.VnH(),
7673 z0.VnH(),
7674 p1.Zeroing(),
7675 SVEMemOperand(x0, 9, SVE_MUL_VL));
7676 __ Mov(z19, z30);
7677 __ Mov(z20, z31);
7678 __ Mov(z21, z0);
7679 __ Ld3w(z22.VnS(),
7680 z23.VnS(),
7681 z24.VnS(),
7682 p2.Zeroing(),
7683 SVEMemOperand(x0, -12, SVE_MUL_VL));
7684 __ Ld3d(z25.VnD(),
7685 z26.VnD(),
7686 z27.VnD(),
7687 p3.Zeroing(),
7688 SVEMemOperand(x0, 15, SVE_MUL_VL));
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007689
7690 END();
7691
7692 if (CAN_RUN()) {
7693 RUN();
7694
7695 uint8_t* expected = new uint8_t[data_size];
7696 memset(expected, 0, data_size);
7697 uint8_t* middle = &expected[data_size / 2];
7698
7699 int vl_b = vl / kBRegSizeInBytes;
7700 int vl_h = vl / kHRegSizeInBytes;
7701 int vl_s = vl / kSRegSizeInBytes;
7702 int vl_d = vl / kDRegSizeInBytes;
7703
7704 int reg_count = 3;
7705
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007706 // st3b { z10.b, z11.b, z12.b }, SVE_ALL
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007707 for (int i = 0; i < vl_b; i++) {
7708 uint8_t lane0 = 1 - (3 * i);
7709 uint8_t lane1 = 2 - (3 * i);
7710 uint8_t lane2 = 3 - (3 * i);
7711 MemoryWrite(middle, 0, (i * reg_count) + 0, lane0);
7712 MemoryWrite(middle, 0, (i * reg_count) + 1, lane1);
7713 MemoryWrite(middle, 0, (i * reg_count) + 2, lane2);
7714 }
7715
7716 // st3h { z31.h, z0.h, z1.h }, SVE_MUL3
7717 int vl_h_mul3 = vl_h - (vl_h % 3);
7718 for (int i = 0; i < vl_h_mul3; i++) {
7719 int64_t offset = 9 * vl;
7720 uint16_t lane0 = -2 + (5 * i);
7721 uint16_t lane1 = -3 + (5 * i);
7722 uint16_t lane2 = -4 + (5 * i);
7723 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7724 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7725 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
7726 }
7727
7728 // st3w { z30.s, z31.s, z0.s }, SVE_POW2
7729 int vl_s_pow2 = 1 << HighestSetBitPosition(vl_s);
7730 for (int i = 0; i < vl_s_pow2; i++) {
7731 int64_t offset = -12 * vl;
7732 uint32_t lane0 = 3 - (7 * i);
7733 uint32_t lane1 = 4 - (7 * i);
7734 uint32_t lane2 = 5 - (7 * i);
7735 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7736 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7737 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
7738 }
7739
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007740 // st3d { z0.d, z1.d, z2.d }, ((i % 5) == 0)
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007741 for (int i = 0; i < vl_d; i++) {
7742 if ((i % 5) == 0) {
7743 int64_t offset = 15 * vl;
7744 uint64_t lane0 = -7 + (3 * i);
7745 uint64_t lane1 = -8 + (3 * i);
7746 uint64_t lane2 = -9 + (3 * i);
7747 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7748 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7749 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
7750 }
7751 }
7752
7753 ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
7754
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007755 // Check that we loaded back the expected values.
7756
7757 // st3b/ld3b
7758 ASSERT_EQUAL_SVE(z4, z16);
7759 ASSERT_EQUAL_SVE(z5, z17);
7760 ASSERT_EQUAL_SVE(z6, z18);
7761
7762 // st3h/ld3h
7763 ASSERT_EQUAL_SVE(z7, z19);
7764 ASSERT_EQUAL_SVE(z8, z20);
7765 ASSERT_EQUAL_SVE(z9, z21);
7766
7767 // st3w/ld3w
7768 ASSERT_EQUAL_SVE(z10, z22);
7769 ASSERT_EQUAL_SVE(z11, z23);
7770 ASSERT_EQUAL_SVE(z12, z24);
7771
7772 // st3d/ld3d
7773 ASSERT_EQUAL_SVE(z13, z25);
7774 ASSERT_EQUAL_SVE(z14, z26);
7775 ASSERT_EQUAL_SVE(z15, z27);
7776
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007777 delete[] expected;
7778 }
7779 delete[] data;
7780}
7781
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007782TEST_SVE(sve_ld3_st3_scalar_plus_scalar) {
7783 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
7784 START();
7785
7786 int vl = config->sve_vl_in_bytes();
7787
7788 // Allocate plenty of space to enable indexing in both directions.
7789 int data_size = vl * 128;
7790
7791 uint8_t* data = new uint8_t[data_size];
7792 memset(data, 0, data_size);
7793
7794  // Set the base half-way through the buffer so we can use negative indices.
7795 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
7796
Jacob Bramleye483ce52019-11-05 16:52:29 +00007797 // We can test ld3 by comparing the values loaded with the values stored.
7798 // There are two complications:
7799 // - Loads have zeroing predication, so we have to clear the inactive
7800 // elements on our reference.
7801 // - We want to test both loads and stores that span { z31, z0 }, so we have
7802 // to move some values around.
7803 //
7804 // Registers z4-z15 will hold as-stored values (with inactive elements
7805 // cleared). Registers z16-z27 will hold the values that were loaded.
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007806
Jacob Bramleye483ce52019-11-05 16:52:29 +00007807 __ Index(z10.VnB(), -4, 11);
7808 __ Index(z11.VnB(), -5, 11);
7809 __ Index(z12.VnB(), -6, 11);
7810 __ Ptrue(p7.VnB(), SVE_MUL4);
7811 __ Rdvl(x1, -1); // Make offsets VL-dependent so we can avoid overlap.
7812 __ St3b(z10.VnB(), z11.VnB(), z12.VnB(), p7, SVEMemOperand(x0, x1, LSL, 0));
7813 // Save the stored values for ld3 tests.
7814 __ Dup(z4.VnB(), 0);
7815 __ Dup(z5.VnB(), 0);
7816 __ Dup(z6.VnB(), 0);
7817 __ Mov(z4.VnB(), p7.Merging(), z10.VnB());
7818 __ Mov(z5.VnB(), p7.Merging(), z11.VnB());
7819 __ Mov(z6.VnB(), p7.Merging(), z12.VnB());
7820
7821 __ Index(z13.VnH(), 6, -2);
7822 __ Index(z14.VnH(), 7, -2);
7823 __ Index(z15.VnH(), 8, -2);
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007824 __ Ptrue(p6.VnH(), SVE_VL16);
Jacob Bramleye483ce52019-11-05 16:52:29 +00007825 __ Rdvl(x2, 5); // (5 * vl) << 1 = 10 * vl
7826 __ St3h(z13.VnH(), z14.VnH(), z15.VnH(), p6, SVEMemOperand(x0, x2, LSL, 1));
7827 // Save the stored values for ld3 tests.
7828 __ Dup(z7.VnH(), 0);
7829 __ Dup(z8.VnH(), 0);
7830 __ Dup(z9.VnH(), 0);
7831 __ Mov(z7.VnH(), p6.Merging(), z13.VnH());
7832 __ Mov(z8.VnH(), p6.Merging(), z14.VnH());
7833 __ Mov(z9.VnH(), p6.Merging(), z15.VnH());
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007834
7835 // Wrap around from z31 to z0.
7836 __ Index(z30.VnS(), -7, 3);
7837 __ Index(z31.VnS(), -8, 3);
7838 __ Index(z0.VnS(), -9, 3);
7839 // Sparse predication, including some irrelevant bits (0xe). To make the
7840 // results easy to check, activate each lane <n> where n is a multiple of 5.
7841 Initialise(&masm,
7842 p5,
7843 0xeee1000010000100,
7844 0x001eeee100001000,
7845 0x0100001eeee10000,
7846 0x10000100001eeee1);
Jacob Bramleye483ce52019-11-05 16:52:29 +00007847 __ Rdvl(x3, -5); // -(5 * vl) << 2 = -20 * vl
7848 __ St3w(z30.VnS(), z31.VnS(), z0.VnS(), p5, SVEMemOperand(x0, x3, LSL, 2));
7849 // Save the stored values for ld3 tests.
7850 __ Dup(z10.VnS(), 0);
7851 __ Dup(z11.VnS(), 0);
7852 __ Dup(z12.VnS(), 0);
7853 __ Mov(z10.VnS(), p5.Merging(), z30.VnS());
7854 __ Mov(z11.VnS(), p5.Merging(), z31.VnS());
7855 __ Mov(z12.VnS(), p5.Merging(), z0.VnS());
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007856
7857 __ Index(z31.VnD(), 32, -11);
7858 __ Index(z0.VnD(), 33, -11);
7859 __ Index(z1.VnD(), 34, -11);
7860 __ Ptrue(p4.VnD(), SVE_MUL3);
Jacob Bramleye483ce52019-11-05 16:52:29 +00007861 __ Rdvl(x4, -1); // -(1 * vl) << 3 = -8 * vl
7862 __ St3d(z31.VnD(), z0.VnD(), z1.VnD(), p4, SVEMemOperand(x0, x4, LSL, 3));
7863 // Save the stored values for ld3 tests.
7864 __ Dup(z13.VnD(), 0);
7865 __ Dup(z14.VnD(), 0);
7866 __ Dup(z15.VnD(), 0);
7867 __ Mov(z13.VnD(), p4.Merging(), z31.VnD());
7868 __ Mov(z14.VnD(), p4.Merging(), z0.VnD());
7869 __ Mov(z15.VnD(), p4.Merging(), z1.VnD());
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007870
Jacob Bramleye483ce52019-11-05 16:52:29 +00007871 // Corresponding loads.
7872 // Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
7873 __ Ld3b(z31.VnB(),
7874 z0.VnB(),
7875 z1.VnB(),
7876 p7.Zeroing(),
7877 SVEMemOperand(x0, x1, LSL, 0));
7878 __ Mov(z16, z31);
7879 __ Mov(z17, z0);
7880 __ Mov(z18, z1);
7881 __ Ld3h(z30.VnH(),
7882 z31.VnH(),
7883 z0.VnH(),
7884 p6.Zeroing(),
7885 SVEMemOperand(x0, x2, LSL, 1));
7886 __ Mov(z19, z30);
7887 __ Mov(z20, z31);
7888 __ Mov(z21, z0);
7889 __ Ld3w(z22.VnS(),
7890 z23.VnS(),
7891 z24.VnS(),
7892 p5.Zeroing(),
7893 SVEMemOperand(x0, x3, LSL, 2));
7894 __ Ld3d(z25.VnD(),
7895 z26.VnD(),
7896 z27.VnD(),
7897 p4.Zeroing(),
7898 SVEMemOperand(x0, x4, LSL, 3));
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007899
7900 END();
7901
7902 if (CAN_RUN()) {
7903 RUN();
7904
7905 uint8_t* expected = new uint8_t[data_size];
7906 memset(expected, 0, data_size);
7907 uint8_t* middle = &expected[data_size / 2];
7908
7909 int vl_b = vl / kBRegSizeInBytes;
7910 int vl_h = vl / kHRegSizeInBytes;
7911 int vl_s = vl / kSRegSizeInBytes;
7912 int vl_d = vl / kDRegSizeInBytes;
7913
7914 int reg_count = 3;
7915
Jacob Bramleye483ce52019-11-05 16:52:29 +00007916 // st3b { z10.b, z11.b, z12.b }, SVE_MUL4
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007917 int vl_b_mul4 = vl_b - (vl_b % 4);
7918 for (int i = 0; i < vl_b_mul4; i++) {
Jacob Bramleye483ce52019-11-05 16:52:29 +00007919 int64_t offset = -(1 << kBRegSizeInBytesLog2) * vl;
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007920 uint8_t lane0 = -4 + (11 * i);
7921 uint8_t lane1 = -5 + (11 * i);
7922 uint8_t lane2 = -6 + (11 * i);
7923 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7924 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7925 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
7926 }
7927
Jacob Bramleye483ce52019-11-05 16:52:29 +00007928 // st3h { z13.h, z14.h, z15.h }, SVE_VL16
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007929 if (vl_h >= 16) {
7930 for (int i = 0; i < 16; i++) {
7931 int64_t offset = (5 << kHRegSizeInBytesLog2) * vl;
7932 uint16_t lane0 = 6 - (2 * i);
7933 uint16_t lane1 = 7 - (2 * i);
7934 uint16_t lane2 = 8 - (2 * i);
7935 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7936 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7937 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
7938 }
7939 }
7940
7941 // st3w { z30.s, z31.s, z0.s }, ((i % 5) == 0)
7942 for (int i = 0; i < vl_s; i++) {
7943 if ((i % 5) == 0) {
7944 int64_t offset = -(5 << kSRegSizeInBytesLog2) * vl;
7945 uint32_t lane0 = -7 + (3 * i);
7946 uint32_t lane1 = -8 + (3 * i);
7947 uint32_t lane2 = -9 + (3 * i);
7948 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7949 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7950 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
7951 }
7952 }
7953
7954 // st3d { z31.d, z0.d, z1.d }, SVE_MUL3
7955 int vl_d_mul3 = vl_d - (vl_d % 3);
7956 for (int i = 0; i < vl_d_mul3; i++) {
Jacob Bramleye483ce52019-11-05 16:52:29 +00007957 int64_t offset = -(1 << kDRegSizeInBytesLog2) * vl;
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007958 uint64_t lane0 = 32 - (11 * i);
7959 uint64_t lane1 = 33 - (11 * i);
7960 uint64_t lane2 = 34 - (11 * i);
7961 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7962 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7963 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
7964 }
7965
7966 ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
7967
Jacob Bramleye483ce52019-11-05 16:52:29 +00007968 // Check that we loaded back the expected values.
7969
7970 // st3b/ld3b
7971 ASSERT_EQUAL_SVE(z4, z16);
7972 ASSERT_EQUAL_SVE(z5, z17);
7973 ASSERT_EQUAL_SVE(z6, z18);
7974
7975 // st3h/ld3h
7976 ASSERT_EQUAL_SVE(z7, z19);
7977 ASSERT_EQUAL_SVE(z8, z20);
7978 ASSERT_EQUAL_SVE(z9, z21);
7979
7980 // st3w/ld3w
7981 ASSERT_EQUAL_SVE(z10, z22);
7982 ASSERT_EQUAL_SVE(z11, z23);
7983 ASSERT_EQUAL_SVE(z12, z24);
7984
7985 // st3d/ld3d
7986 ASSERT_EQUAL_SVE(z13, z25);
7987 ASSERT_EQUAL_SVE(z14, z26);
7988 ASSERT_EQUAL_SVE(z15, z27);
7989
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007990 delete[] expected;
7991 }
7992 delete[] data;
7993}
7994
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007995TEST_SVE(sve_ld4_st4_scalar_plus_imm) {
7996 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
7997 START();
7998
7999 int vl = config->sve_vl_in_bytes();
8000
8001  // The immediate can address [-32, 28] times the VL, so allocate enough space
8002  // to exceed that in both directions.
8003 int data_size = vl * 128;
8004
8005 uint8_t* data = new uint8_t[data_size];
8006 memset(data, 0, data_size);
8007
8008  // Set the base half-way through the buffer so we can use negative indices.
8009 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
8010
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00008011 // We can test ld4 by comparing the values loaded with the values stored.
8012 // There are two complications:
8013 // - Loads have zeroing predication, so we have to clear the inactive
8014 // elements on our reference.
8015 // - We want to test both loads and stores that span { z31, z0 }, so we have
8016 // to move some values around.
8017 //
8018 // Registers z3-z18 will hold as-stored values (with inactive elements
8019 // cleared). Registers z19-z31 and z0-z2 will hold the values that were
8020 // loaded.
8021
8022 __ Index(z10.VnB(), 1, -7);
8023 __ Index(z11.VnB(), 2, -7);
8024 __ Index(z12.VnB(), 3, -7);
8025 __ Index(z13.VnB(), 4, -7);
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008026 __ Ptrue(p0.VnB());
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00008027 __ St4b(z10.VnB(), z11.VnB(), z12.VnB(), z13.VnB(), p0, SVEMemOperand(x0));
8028 // Save the stored values for ld4 tests.
8029 __ Dup(z3.VnB(), 0);
8030 __ Dup(z4.VnB(), 0);
8031 __ Dup(z5.VnB(), 0);
8032 __ Dup(z6.VnB(), 0);
8033 __ Mov(z3.VnB(), p0.Merging(), z10.VnB());
8034 __ Mov(z4.VnB(), p0.Merging(), z11.VnB());
8035 __ Mov(z5.VnB(), p0.Merging(), z12.VnB());
8036 __ Mov(z6.VnB(), p0.Merging(), z13.VnB());
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008037
8038 // Wrap around from z31 to z0.
8039 __ Index(z31.VnH(), -2, 5);
8040 __ Index(z0.VnH(), -3, 5);
8041 __ Index(z1.VnH(), -4, 5);
8042 __ Index(z2.VnH(), -5, 5);
8043 __ Ptrue(p1.VnH(), SVE_MUL3);
8044 __ St4h(z31.VnH(),
8045 z0.VnH(),
8046 z1.VnH(),
8047 z2.VnH(),
8048 p1,
8049 SVEMemOperand(x0, 4, SVE_MUL_VL));
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00008050 // Save the stored values for ld4 tests.
8051 __ Dup(z7.VnH(), 0);
8052 __ Dup(z8.VnH(), 0);
8053 __ Dup(z9.VnH(), 0);
8054 __ Dup(z10.VnH(), 0);
8055 __ Mov(z7.VnH(), p1.Merging(), z31.VnH());
8056 __ Mov(z8.VnH(), p1.Merging(), z0.VnH());
8057 __ Mov(z9.VnH(), p1.Merging(), z1.VnH());
8058 __ Mov(z10.VnH(), p1.Merging(), z2.VnH());
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008059
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00008060 // Wrap around from z31 to z0.
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008061 __ Index(z29.VnS(), 2, -7);
8062 __ Index(z30.VnS(), 3, -7);
8063 __ Index(z31.VnS(), 4, -7);
8064 __ Index(z0.VnS(), 5, -7);
8065 __ Ptrue(p2.VnS(), SVE_POW2);
8066 __ St4w(z29.VnS(),
8067 z30.VnS(),
8068 z31.VnS(),
8069 z0.VnS(),
8070 p2,
8071 SVEMemOperand(x0, -12, SVE_MUL_VL));
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00008072 // Save the stored values for ld4 tests.
8073 __ Dup(z11.VnS(), 0);
8074 __ Dup(z12.VnS(), 0);
8075 __ Dup(z13.VnS(), 0);
8076 __ Dup(z14.VnS(), 0);
8077 __ Mov(z11.VnS(), p2.Merging(), z29.VnS());
8078 __ Mov(z12.VnS(), p2.Merging(), z30.VnS());
8079 __ Mov(z13.VnS(), p2.Merging(), z31.VnS());
8080 __ Mov(z14.VnS(), p2.Merging(), z0.VnS());
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008081
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00008082 __ Index(z20.VnD(), -7, 8);
8083 __ Index(z21.VnD(), -8, 8);
8084 __ Index(z22.VnD(), -9, 8);
8085 __ Index(z23.VnD(), -10, 8);
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008086 // Sparse predication, including some irrelevant bits (0xee). To make the
8087 // results easy to check, activate each lane <n> where n is a multiple of 5.
8088 Initialise(&masm,
8089 p3,
8090 0xeee10000000001ee,
8091 0xeeeeeee100000000,
8092 0x01eeeeeeeee10000,
8093 0x000001eeeeeeeee1);
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00008094 __ St4d(z20.VnD(),
8095 z21.VnD(),
8096 z22.VnD(),
8097 z23.VnD(),
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008098 p3,
8099 SVEMemOperand(x0, 16, SVE_MUL_VL));
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00008100 // Save the stored values for ld4 tests.
8101 __ Dup(z15.VnD(), 0);
8102 __ Dup(z16.VnD(), 0);
8103 __ Dup(z17.VnD(), 0);
8104 __ Dup(z18.VnD(), 0);
8105 __ Mov(z15.VnD(), p3.Merging(), z20.VnD());
8106 __ Mov(z16.VnD(), p3.Merging(), z21.VnD());
8107 __ Mov(z17.VnD(), p3.Merging(), z22.VnD());
8108 __ Mov(z18.VnD(), p3.Merging(), z23.VnD());
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008109
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00008110 // Corresponding loads.
8111 // Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
8112 __ Ld4b(z31.VnB(),
8113 z0.VnB(),
8114 z1.VnB(),
8115 z2.VnB(),
8116 p0.Zeroing(),
8117 SVEMemOperand(x0));
8118 __ Mov(z19, z31);
8119 __ Mov(z20, z0);
8120 __ Mov(z21, z1);
8121 __ Mov(z22, z2);
8122 __ Ld4h(z23.VnH(),
8123 z24.VnH(),
8124 z25.VnH(),
8125 z26.VnH(),
8126 p1.Zeroing(),
8127 SVEMemOperand(x0, 4, SVE_MUL_VL));
8128 __ Ld4w(z27.VnS(),
8129 z28.VnS(),
8130 z29.VnS(),
8131 z30.VnS(),
8132 p2.Zeroing(),
8133 SVEMemOperand(x0, -12, SVE_MUL_VL));
8134 // Wrap around from z31 to z0.
8135 __ Ld4d(z31.VnD(),
8136 z0.VnD(),
8137 z1.VnD(),
8138 z2.VnD(),
8139 p3.Zeroing(),
8140 SVEMemOperand(x0, 16, SVE_MUL_VL));
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008141
8142 END();
8143
8144 if (CAN_RUN()) {
8145 RUN();
8146
8147 uint8_t* expected = new uint8_t[data_size];
8148 memset(expected, 0, data_size);
8149 uint8_t* middle = &expected[data_size / 2];
8150
8151 int vl_b = vl / kBRegSizeInBytes;
8152 int vl_h = vl / kHRegSizeInBytes;
8153 int vl_s = vl / kSRegSizeInBytes;
8154 int vl_d = vl / kDRegSizeInBytes;
8155
8156 int reg_count = 4;
8157
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00008158    // st4b { z10.b, z11.b, z12.b, z13.b }, SVE_ALL
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008159 for (int i = 0; i < vl_b; i++) {
8160 uint8_t lane0 = 1 - (7 * i);
8161 uint8_t lane1 = 2 - (7 * i);
8162 uint8_t lane2 = 3 - (7 * i);
8163 uint8_t lane3 = 4 - (7 * i);
8164 MemoryWrite(middle, 0, (i * reg_count) + 0, lane0);
8165 MemoryWrite(middle, 0, (i * reg_count) + 1, lane1);
8166 MemoryWrite(middle, 0, (i * reg_count) + 2, lane2);
8167 MemoryWrite(middle, 0, (i * reg_count) + 3, lane3);
8168 }
8169
8170 // st4h { z31.h, z0.h, z1.h, z2.h }, SVE_MUL3
8171 int vl_h_mul3 = vl_h - (vl_h % 3);
8172 for (int i = 0; i < vl_h_mul3; i++) {
8173 int64_t offset = 4 * vl;
8174 uint16_t lane0 = -2 + (5 * i);
8175 uint16_t lane1 = -3 + (5 * i);
8176 uint16_t lane2 = -4 + (5 * i);
8177 uint16_t lane3 = -5 + (5 * i);
8178 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
8179 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
8180 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
8181 MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
8182 }
8183
8184 // st4w { z29.s, z30.s, z31.s, z0.s }, SVE_POW2
8185 int vl_s_pow2 = 1 << HighestSetBitPosition(vl_s);
8186 for (int i = 0; i < vl_s_pow2; i++) {
8187 int64_t offset = -12 * vl;
8188 uint32_t lane0 = 2 - (7 * i);
8189 uint32_t lane1 = 3 - (7 * i);
8190 uint32_t lane2 = 4 - (7 * i);
8191 uint32_t lane3 = 5 - (7 * i);
8192 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
8193 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
8194 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
8195 MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
8196 }
8197
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00008198 // st4d { z20.d, z21.d, z22.d, z23.d }, ((i % 5) == 0)
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008199 for (int i = 0; i < vl_d; i++) {
8200 if ((i % 5) == 0) {
8201 int64_t offset = 16 * vl;
8202 uint64_t lane0 = -7 + (8 * i);
8203 uint64_t lane1 = -8 + (8 * i);
8204 uint64_t lane2 = -9 + (8 * i);
8205 uint64_t lane3 = -10 + (8 * i);
8206 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
8207 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
8208 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
8209 MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
8210 }
8211 }
8212
8213 ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
8214
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00008215 // Check that we loaded back the expected values.
8216
8217 // st4b/ld4b
8218 ASSERT_EQUAL_SVE(z3, z19);
8219 ASSERT_EQUAL_SVE(z4, z20);
8220 ASSERT_EQUAL_SVE(z5, z21);
8221 ASSERT_EQUAL_SVE(z6, z22);
8222
8223 // st4h/ld4h
8224 ASSERT_EQUAL_SVE(z7, z23);
8225 ASSERT_EQUAL_SVE(z8, z24);
8226 ASSERT_EQUAL_SVE(z9, z25);
8227 ASSERT_EQUAL_SVE(z10, z26);
8228
8229 // st4w/ld4w
8230 ASSERT_EQUAL_SVE(z11, z27);
8231 ASSERT_EQUAL_SVE(z12, z28);
8232 ASSERT_EQUAL_SVE(z13, z29);
8233 ASSERT_EQUAL_SVE(z14, z30);
8234
8235 // st4d/ld4d
8236 ASSERT_EQUAL_SVE(z15, z31);
8237 ASSERT_EQUAL_SVE(z16, z0);
8238 ASSERT_EQUAL_SVE(z17, z1);
8239 ASSERT_EQUAL_SVE(z18, z2);
8240
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008241 delete[] expected;
8242 }
8243 delete[] data;
8244}
8245
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008246TEST_SVE(sve_ld4_st4_scalar_plus_scalar) {
8247 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
8248 START();
8249
8250 int vl = config->sve_vl_in_bytes();
8251
8252 // Allocate plenty of space to enable indexing in both directions.
8253 int data_size = vl * 128;
8254
8255 uint8_t* data = new uint8_t[data_size];
8256 memset(data, 0, data_size);
8257
8258  // Set the base half-way through the buffer so we can use negative indices.
8259 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
8260
Jacob Bramleye483ce52019-11-05 16:52:29 +00008261 // We can test ld4 by comparing the values loaded with the values stored.
8262 // There are two complications:
8263 // - Loads have zeroing predication, so we have to clear the inactive
8264 // elements on our reference.
8265 // - We want to test both loads and stores that span { z31, z0 }, so we have
8266 // to move some values around.
8267 //
8268 // Registers z3-z18 will hold as-stored values (with inactive elements
8269 // cleared). Registers z19-z31 and z0-z2 will hold the values that were
8270 // loaded.
8271
8272 __ Index(z19.VnB(), -4, 11);
8273 __ Index(z20.VnB(), -5, 11);
8274 __ Index(z21.VnB(), -6, 11);
8275 __ Index(z22.VnB(), -7, 11);
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008276 __ Ptrue(p7.VnB(), SVE_MUL4);
Jacob Bramleye483ce52019-11-05 16:52:29 +00008277 __ Rdvl(x1, -1); // Make offsets VL-dependent so we can avoid overlap.
8278 __ St4b(z19.VnB(),
8279 z20.VnB(),
8280 z21.VnB(),
8281 z22.VnB(),
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008282 p7,
8283 SVEMemOperand(x0, x1, LSL, 0));
Jacob Bramleye483ce52019-11-05 16:52:29 +00008284 // Save the stored values for ld4 tests.
8285 __ Dup(z3.VnB(), 0);
8286 __ Dup(z4.VnB(), 0);
8287 __ Dup(z5.VnB(), 0);
8288 __ Dup(z6.VnB(), 0);
8289 __ Mov(z3.VnB(), p7.Merging(), z19.VnB());
8290 __ Mov(z4.VnB(), p7.Merging(), z20.VnB());
8291 __ Mov(z5.VnB(), p7.Merging(), z21.VnB());
8292 __ Mov(z6.VnB(), p7.Merging(), z22.VnB());
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008293
Jacob Bramleye483ce52019-11-05 16:52:29 +00008294 __ Index(z23.VnH(), 6, -2);
8295 __ Index(z24.VnH(), 7, -2);
8296 __ Index(z25.VnH(), 8, -2);
8297 __ Index(z26.VnH(), 9, -2);
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008298 __ Ptrue(p6.VnH(), SVE_VL16);
Jacob Bramleye483ce52019-11-05 16:52:29 +00008299 __ Rdvl(x2, 7); // (7 * vl) << 1 = 14 * vl
8300 __ St4h(z23.VnH(),
8301 z24.VnH(),
8302 z25.VnH(),
8303 z26.VnH(),
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008304 p6,
8305 SVEMemOperand(x0, x2, LSL, 1));
Jacob Bramleye483ce52019-11-05 16:52:29 +00008306 // Save the stored values for ld4 tests.
8307 __ Dup(z7.VnH(), 0);
8308 __ Dup(z8.VnH(), 0);
8309 __ Dup(z9.VnH(), 0);
8310 __ Dup(z10.VnH(), 0);
8311 __ Mov(z7.VnH(), p6.Merging(), z23.VnH());
8312 __ Mov(z8.VnH(), p6.Merging(), z24.VnH());
8313 __ Mov(z9.VnH(), p6.Merging(), z25.VnH());
8314 __ Mov(z10.VnH(), p6.Merging(), z26.VnH());
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008315
8316 // Wrap around from z31 to z0.
8317 __ Index(z29.VnS(), -6, 7);
8318 __ Index(z30.VnS(), -7, 7);
8319 __ Index(z31.VnS(), -8, 7);
8320 __ Index(z0.VnS(), -9, 7);
8321 // Sparse predication, including some irrelevant bits (0xe). To make the
8322 // results easy to check, activate each lane <n> where n is a multiple of 5.
8323 Initialise(&masm,
8324 p5,
8325 0xeee1000010000100,
8326 0x001eeee100001000,
8327 0x0100001eeee10000,
8328 0x10000100001eeee1);
Jacob Bramleye483ce52019-11-05 16:52:29 +00008329 __ Rdvl(x3, -5); // -(5 * vl) << 2 = -20 * vl
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008330 __ St4w(z29.VnS(),
8331 z30.VnS(),
8332 z31.VnS(),
8333 z0.VnS(),
8334 p5,
Jacob Bramleye483ce52019-11-05 16:52:29 +00008335 SVEMemOperand(x0, x3, LSL, 2));
8336 // Save the stored values for ld4 tests.
8337 __ Dup(z11.VnS(), 0);
8338 __ Dup(z12.VnS(), 0);
8339 __ Dup(z13.VnS(), 0);
8340 __ Dup(z14.VnS(), 0);
8341 __ Mov(z11.VnS(), p5.Merging(), z29.VnS());
8342 __ Mov(z12.VnS(), p5.Merging(), z30.VnS());
8343 __ Mov(z13.VnS(), p5.Merging(), z31.VnS());
8344 __ Mov(z14.VnS(), p5.Merging(), z0.VnS());
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008345
8346 __ Index(z31.VnD(), 32, -11);
8347 __ Index(z0.VnD(), 33, -11);
8348 __ Index(z1.VnD(), 34, -11);
8349 __ Index(z2.VnD(), 35, -11);
8350 __ Ptrue(p4.VnD(), SVE_MUL3);
Jacob Bramleye483ce52019-11-05 16:52:29 +00008351  __ Rdvl(x4, -1);  // -(1 * vl) << 3 = -8 * vl
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008352 __ St4d(z31.VnD(),
8353 z0.VnD(),
8354 z1.VnD(),
8355 z2.VnD(),
8356 p4,
Jacob Bramleye483ce52019-11-05 16:52:29 +00008357 SVEMemOperand(x0, x4, LSL, 3));
8358 // Save the stored values for ld4 tests.
8359 __ Dup(z15.VnD(), 0);
8360 __ Dup(z16.VnD(), 0);
8361 __ Dup(z17.VnD(), 0);
8362 __ Dup(z18.VnD(), 0);
8363 __ Mov(z15.VnD(), p4.Merging(), z31.VnD());
8364 __ Mov(z16.VnD(), p4.Merging(), z0.VnD());
8365 __ Mov(z17.VnD(), p4.Merging(), z1.VnD());
8366 __ Mov(z18.VnD(), p4.Merging(), z2.VnD());
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008367
Jacob Bramleye483ce52019-11-05 16:52:29 +00008368 // Corresponding loads.
8369 // Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
8370 __ Ld4b(z31.VnB(),
8371 z0.VnB(),
8372 z1.VnB(),
8373 z2.VnB(),
8374 p7.Zeroing(),
8375 SVEMemOperand(x0, x1, LSL, 0));
8376 __ Mov(z19, z31);
8377 __ Mov(z20, z0);
8378 __ Mov(z21, z1);
8379 __ Mov(z22, z2);
8380 __ Ld4h(z23.VnH(),
8381 z24.VnH(),
8382 z25.VnH(),
8383 z26.VnH(),
8384 p6.Zeroing(),
8385 SVEMemOperand(x0, x2, LSL, 1));
8386 __ Ld4w(z27.VnS(),
8387 z28.VnS(),
8388 z29.VnS(),
8389 z30.VnS(),
8390 p5.Zeroing(),
8391 SVEMemOperand(x0, x3, LSL, 2));
8392 // Wrap around from z31 to z0.
8393 __ Ld4d(z31.VnD(),
8394 z0.VnD(),
8395 z1.VnD(),
8396 z2.VnD(),
8397 p4.Zeroing(),
8398 SVEMemOperand(x0, x4, LSL, 3));
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008399
8400 END();
8401
8402 if (CAN_RUN()) {
8403 RUN();
8404
8405 uint8_t* expected = new uint8_t[data_size];
8406 memset(expected, 0, data_size);
8407 uint8_t* middle = &expected[data_size / 2];
8408
8409 int vl_b = vl / kBRegSizeInBytes;
8410 int vl_h = vl / kHRegSizeInBytes;
8411 int vl_s = vl / kSRegSizeInBytes;
8412 int vl_d = vl / kDRegSizeInBytes;
8413
8414 int reg_count = 4;
8415
Jacob Bramleye483ce52019-11-05 16:52:29 +00008416 // st4b { z19.b, z20.b, z21.b, z22.b }, SVE_MUL4
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008417 int vl_b_mul4 = vl_b - (vl_b % 4);
8418 for (int i = 0; i < vl_b_mul4; i++) {
Jacob Bramleye483ce52019-11-05 16:52:29 +00008419 int64_t offset = -(1 << kBRegSizeInBytesLog2) * vl;
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008420 uint8_t lane0 = -4 + (11 * i);
8421 uint8_t lane1 = -5 + (11 * i);
8422 uint8_t lane2 = -6 + (11 * i);
8423 uint8_t lane3 = -7 + (11 * i);
8424 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
8425 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
8426 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
8427 MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
8428 }
8429
Jacob Bramleye483ce52019-11-05 16:52:29 +00008430    // st4h { z23.h, z24.h, z25.h, z26.h }, SVE_VL16
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008431 if (vl_h >= 16) {
8432 for (int i = 0; i < 16; i++) {
8433 int64_t offset = (7 << kHRegSizeInBytesLog2) * vl;
8434 uint16_t lane0 = 6 - (2 * i);
8435 uint16_t lane1 = 7 - (2 * i);
8436 uint16_t lane2 = 8 - (2 * i);
8437 uint16_t lane3 = 9 - (2 * i);
8438 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
8439 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
8440 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
8441 MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
8442 }
8443 }
8444
8445 // st4w { z29.s, z30.s, z31.s, z0.s }, ((i % 5) == 0)
8446 for (int i = 0; i < vl_s; i++) {
8447 if ((i % 5) == 0) {
8448 int64_t offset = -(5 << kSRegSizeInBytesLog2) * vl;
8449 uint32_t lane0 = -6 + (7 * i);
8450 uint32_t lane1 = -7 + (7 * i);
8451 uint32_t lane2 = -8 + (7 * i);
8452 uint32_t lane3 = -9 + (7 * i);
8453 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
8454 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
8455 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
8456 MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
8457 }
8458 }
8459
8460 // st4d { z31.d, z0.d, z1.d, z2.d }, SVE_MUL3
8461 int vl_d_mul3 = vl_d - (vl_d % 3);
8462 for (int i = 0; i < vl_d_mul3; i++) {
Jacob Bramleye483ce52019-11-05 16:52:29 +00008463 int64_t offset = -(1 << kDRegSizeInBytesLog2) * vl;
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008464 uint64_t lane0 = 32 - (11 * i);
8465 uint64_t lane1 = 33 - (11 * i);
8466 uint64_t lane2 = 34 - (11 * i);
8467 uint64_t lane3 = 35 - (11 * i);
8468 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
8469 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
8470 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
8471 MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
8472 }
8473
8474 ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
8475
Jacob Bramleye483ce52019-11-05 16:52:29 +00008476 // Check that we loaded back the expected values.
8477
8478 // st4b/ld4b
8479 ASSERT_EQUAL_SVE(z3, z19);
8480 ASSERT_EQUAL_SVE(z4, z20);
8481 ASSERT_EQUAL_SVE(z5, z21);
8482 ASSERT_EQUAL_SVE(z6, z22);
8483
8484 // st4h/ld4h
8485 ASSERT_EQUAL_SVE(z7, z23);
8486 ASSERT_EQUAL_SVE(z8, z24);
8487 ASSERT_EQUAL_SVE(z9, z25);
8488 ASSERT_EQUAL_SVE(z10, z26);
8489
8490 // st4w/ld4w
8491 ASSERT_EQUAL_SVE(z11, z27);
8492 ASSERT_EQUAL_SVE(z12, z28);
8493 ASSERT_EQUAL_SVE(z13, z29);
8494 ASSERT_EQUAL_SVE(z14, z30);
8495
8496 // st4d/ld4d
8497 ASSERT_EQUAL_SVE(z15, z31);
8498 ASSERT_EQUAL_SVE(z16, z0);
8499 ASSERT_EQUAL_SVE(z17, z1);
8500 ASSERT_EQUAL_SVE(z18, z2);
8501
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008502 delete[] expected;
8503 }
8504 delete[] data;
8505}
8506
8507TEST_SVE(sve_ld234_st234_scalar_plus_scalar_sp) {
8508 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
8509 START();
8510
8511 // Check that the simulator correctly interprets rn == 31 as sp.
8512 // The indexing logic is the same regardless so we just check one load and
8513 // store of each type.
8514
8515 // There are no pre- or post-indexing modes, so reserve space first.
8516 __ ClaimVL(2 + 3 + 4);
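  // Stack layout (in VLs, from sp): [0, 2) for the st2b below, [2, 5) for the
  // st3h and [5, 9) for the st4w, matching the scalar offsets computed with
  // Rdvl.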
8517
8518 __ Index(z0.VnB(), 42, 2);
8519 __ Index(z1.VnB(), 43, 2);
8520 __ Ptrue(p0.VnB(), SVE_VL7);
8521 __ Rdvl(x0, 0);
8522 __ St2b(z0.VnB(), z1.VnB(), p0, SVEMemOperand(sp, x0));
8523
8524 __ Index(z4.VnH(), 42, 3);
8525 __ Index(z5.VnH(), 43, 3);
8526 __ Index(z6.VnH(), 44, 3);
8527 __ Ptrue(p1.VnH(), SVE_POW2);
8528 __ Rdvl(x1, 2);
8529 __ Lsr(x1, x1, 1);
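  // x1 = (2 * VL) >> 1, so after the LSL #1 in the addressing mode below the
  // byte offset is exactly 2 VLs, i.e. just past the st2b region.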
8530 __ St3h(z4.VnH(), z5.VnH(), z6.VnH(), p1, SVEMemOperand(sp, x1, LSL, 1));
8531
8532 __ Index(z8.VnS(), 42, 4);
8533 __ Index(z9.VnS(), 43, 4);
8534 __ Index(z10.VnS(), 44, 4);
8535 __ Index(z11.VnS(), 45, 4);
8536 __ Ptrue(p2.VnS());
8537 __ Rdvl(x2, 2 + 3);
8538 __ Lsr(x2, x2, 2);
8539 __ St4w(z8.VnS(),
8540 z9.VnS(),
8541 z10.VnS(),
8542 z11.VnS(),
8543 p2,
8544 SVEMemOperand(sp, x2, LSL, 2));
8545
Jacob Bramleye483ce52019-11-05 16:52:29 +00008546 // Corresponding loads.
8547 // We have to explicitly zero inactive lanes in the reference values because
8548 // loads have zeroing predication.
8549 __ Dup(z12.VnB(), 0);
8550 __ Dup(z13.VnB(), 0);
8551 __ Mov(z12.VnB(), p0.Merging(), z0.VnB());
8552 __ Mov(z13.VnB(), p0.Merging(), z1.VnB());
8553 __ Ld2b(z0.VnB(), z1.VnB(), p0.Zeroing(), SVEMemOperand(sp, x0));
8554
8555 __ Dup(z16.VnH(), 0);
8556 __ Dup(z17.VnH(), 0);
8557 __ Dup(z18.VnH(), 0);
8558 __ Mov(z16.VnH(), p1.Merging(), z4.VnH());
8559 __ Mov(z17.VnH(), p1.Merging(), z5.VnH());
8560 __ Mov(z18.VnH(), p1.Merging(), z6.VnH());
8561 __ Ld3h(z4.VnH(),
8562 z5.VnH(),
8563 z6.VnH(),
8564 p1.Zeroing(),
8565 SVEMemOperand(sp, x1, LSL, 1));
8566
8567 __ Dup(z20.VnS(), 0);
8568 __ Dup(z21.VnS(), 0);
8569 __ Dup(z22.VnS(), 0);
8570 __ Dup(z23.VnS(), 0);
8571 __ Mov(z20.VnS(), p2.Merging(), z8.VnS());
8572 __ Mov(z21.VnS(), p2.Merging(), z9.VnS());
8573 __ Mov(z22.VnS(), p2.Merging(), z10.VnS());
8574 __ Mov(z23.VnS(), p2.Merging(), z11.VnS());
8575 __ Ld4w(z8.VnS(),
8576 z9.VnS(),
8577 z10.VnS(),
8578 z11.VnS(),
8579 p2.Zeroing(),
8580 SVEMemOperand(sp, x2, LSL, 2));
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008581
8582 __ DropVL(2 + 3 + 4);
8583
8584 END();
8585
8586 if (CAN_RUN()) {
8587 RUN();
8588
8589    // The most likely failure mode is that the simulator reads sp as xzr and
8590 // crashes on execution. We already test the address calculations separately
8591 // and sp doesn't change this, so just test that we load the values we
8592 // stored.
Jacob Bramleye483ce52019-11-05 16:52:29 +00008593
8594 // st2b/ld2b
8595 ASSERT_EQUAL_SVE(z0, z12);
8596 ASSERT_EQUAL_SVE(z1, z13);
8597
8598 // st3h/ld3h
8599 ASSERT_EQUAL_SVE(z4, z16);
8600 ASSERT_EQUAL_SVE(z5, z17);
8601 ASSERT_EQUAL_SVE(z6, z18);
8602
8603    // st4w/ld4w
8604 ASSERT_EQUAL_SVE(z8, z20);
8605 ASSERT_EQUAL_SVE(z9, z21);
8606 ASSERT_EQUAL_SVE(z10, z22);
8607 ASSERT_EQUAL_SVE(z11, z23);
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008608 }
8609}
8610
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008611TEST_SVE(sve_ld234_st234_scalar_plus_imm_sp) {
8612 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
8613 START();
8614
8615 // Check that the simulator correctly interprets rn == 31 as sp.
8616 // The indexing logic is the same regardless so we just check one load and
8617 // store of each type.
8618
8619 // There are no pre- or post-indexing modes, so reserve space first.
8620 // Note that the stores fill in an order that allows each immediate to be a
8621 // multiple of the number of registers.
8622 __ ClaimVL(4 + 2 + 3);
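  // Stack layout (in VLs, from sp): [0, 4) for the st4w, [4, 6) for the st2b
  // and [6, 9) for the st3h, so each immediate (0, 4 and 6) is a multiple of
  // the number of registers being accessed, as the encoding requires.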
8623
8624 __ Index(z0.VnB(), 42, 2);
8625 __ Index(z1.VnB(), 43, 2);
8626 __ Ptrue(p0.VnB(), SVE_POW2);
8627 __ St2b(z0.VnB(), z1.VnB(), p0, SVEMemOperand(sp, 4, SVE_MUL_VL));
8628
8629 __ Index(z4.VnH(), 42, 3);
8630 __ Index(z5.VnH(), 43, 3);
8631 __ Index(z6.VnH(), 44, 3);
8632 __ Ptrue(p1.VnH(), SVE_VL7);
8633 __ St3h(z4.VnH(), z5.VnH(), z6.VnH(), p1, SVEMemOperand(sp, 6, SVE_MUL_VL));
8634
8635 __ Index(z8.VnS(), 42, 4);
8636 __ Index(z9.VnS(), 43, 4);
8637 __ Index(z10.VnS(), 44, 4);
8638 __ Index(z11.VnS(), 45, 4);
8639 __ Ptrue(p2.VnS());
8640 __ St4w(z8.VnS(), z9.VnS(), z10.VnS(), z11.VnS(), p2, SVEMemOperand(sp));
8641
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00008642 // Corresponding loads.
8643 // We have to explicitly zero inactive lanes in the reference values because
8644 // loads have zeroing predication.
8645 __ Dup(z12.VnB(), 0);
8646 __ Dup(z13.VnB(), 0);
8647 __ Mov(z12.VnB(), p0.Merging(), z0.VnB());
8648 __ Mov(z13.VnB(), p0.Merging(), z1.VnB());
8649 __ Ld2b(z0.VnB(), z1.VnB(), p0.Zeroing(), SVEMemOperand(sp, 4, SVE_MUL_VL));
8650
8651 __ Dup(z16.VnH(), 0);
8652 __ Dup(z17.VnH(), 0);
8653 __ Dup(z18.VnH(), 0);
8654 __ Mov(z16.VnH(), p1.Merging(), z4.VnH());
8655 __ Mov(z17.VnH(), p1.Merging(), z5.VnH());
8656 __ Mov(z18.VnH(), p1.Merging(), z6.VnH());
8657 __ Ld3h(z4.VnH(),
8658 z5.VnH(),
8659 z6.VnH(),
8660 p1.Zeroing(),
8661 SVEMemOperand(sp, 6, SVE_MUL_VL));
8662
8663 __ Dup(z20.VnS(), 0);
8664 __ Dup(z21.VnS(), 0);
8665 __ Dup(z22.VnS(), 0);
8666 __ Dup(z23.VnS(), 0);
8667 __ Mov(z20.VnS(), p2.Merging(), z8.VnS());
8668 __ Mov(z21.VnS(), p2.Merging(), z9.VnS());
8669 __ Mov(z22.VnS(), p2.Merging(), z10.VnS());
8670 __ Mov(z23.VnS(), p2.Merging(), z11.VnS());
8671 __ Ld4w(z8.VnS(),
8672 z9.VnS(),
8673 z10.VnS(),
8674 z11.VnS(),
8675 p2.Zeroing(),
8676 SVEMemOperand(sp));
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008677
8678 __ DropVL(4 + 2 + 3);
8679
8680 END();
8681
8682 if (CAN_RUN()) {
8683 RUN();
8684
8685    // The most likely failure mode is that the simulator reads sp as xzr and
8686 // crashes on execution. We already test the address calculations separately
8687 // and sp doesn't change this, so just test that we load the values we
8688 // stored.
    // st2b/ld2b
    ASSERT_EQUAL_SVE(z0, z12);
    ASSERT_EQUAL_SVE(z1, z13);

    // st3h/ld3h
    ASSERT_EQUAL_SVE(z4, z16);
    ASSERT_EQUAL_SVE(z5, z17);
    ASSERT_EQUAL_SVE(z6, z18);

    // st4w/ld4w
    ASSERT_EQUAL_SVE(z8, z20);
    ASSERT_EQUAL_SVE(z9, z21);
    ASSERT_EQUAL_SVE(z10, z22);
    ASSERT_EQUAL_SVE(z11, z23);
8690 }
8691}
8692
TatWai Chong85e15102020-05-04 21:00:40 -07008693// Fill the input buffer with arbitrary data. Meanwhile, assign random offsets
8694// from the base address of the buffer and corresponding addresses to the
8695// arguments if provided.
8696static void BufferFillingHelper(uint64_t data_ptr,
8697 size_t buffer_size,
8698 unsigned lane_size_in_bytes,
8699 int lane_count,
8700 uint64_t* offsets,
8701 uint64_t* addresses = nullptr,
8702 uint64_t* max_address = nullptr) {
8703 // Use a fixed seed for nrand48() so that test runs are reproducible.
8704 unsigned short seed[3] = {1, 2, 3}; // NOLINT(runtime/int)
8705
8706 // Fill a buffer with arbitrary data.
8707 for (size_t i = 0; i < buffer_size; i++) {
8708 uint8_t byte = nrand48(seed) & 0xff;
8709 memcpy(reinterpret_cast<void*>(data_ptr + i), &byte, 1);
8710 }
8711
8712 if (max_address != nullptr) {
8713 *max_address = 0;
8714 }
8715
8716 // Vectors of random addresses and offsets into the buffer.
8717 for (int i = 0; i < lane_count; i++) {
8718 uint64_t rnd = nrand48(seed);
8719 // Limit the range to the set of completely-accessible elements in memory.
8720 offsets[i] = rnd % (buffer_size - lane_size_in_bytes);
8721 if ((addresses != nullptr) && (max_address != nullptr)) {
8722 addresses[i] = data_ptr + offsets[i];
8723 *max_address = std::max(*max_address, addresses[i]);
8724 }
8725 }
8726}
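// A minimal usage sketch (caller-side names are hypothetical): fill `buffer`
// with pseudo-random bytes and derive one address per S-sized lane for a
// gather test.
//
//   uint64_t offsets[kMaxLanes];
//   uint64_t addresses[kMaxLanes];
//   uint64_t max_address;
//   BufferFillingHelper(reinterpret_cast<uint64_t>(buffer),
//                       buffer_size,
//                       kSRegSizeInBytes,
//                       lane_count,
//                       offsets,
//                       addresses,
//                       &max_address);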
8727
TatWai Chong85e15102020-05-04 21:00:40 -07008728static void ScalarLoadHelper(MacroAssembler* masm,
8729 Register dst,
8730 Register addr,
8731 int msize_in_bits,
8732 bool is_signed) {
8733 if (is_signed) {
8734 switch (msize_in_bits) {
8735 case kBRegSize:
8736 masm->Ldrsb(dst, MemOperand(addr));
8737 break;
8738 case kHRegSize:
8739 masm->Ldrsh(dst, MemOperand(addr));
8740 break;
8741 case kWRegSize:
8742 masm->Ldrsw(dst, MemOperand(addr));
8743 break;
8744 default:
8745 VIXL_UNIMPLEMENTED();
8746 break;
8747 }
8748 } else {
8749 switch (msize_in_bits) {
8750 case kBRegSize:
8751 masm->Ldrb(dst, MemOperand(addr));
8752 break;
8753 case kHRegSize:
8754 masm->Ldrh(dst, MemOperand(addr));
8755 break;
8756 case kWRegSize:
8757 masm->Ldr(dst.W(), MemOperand(addr));
8758 break;
8759 case kXRegSize:
8760 masm->Ldr(dst, MemOperand(addr));
8761 break;
8762 default:
8763 VIXL_UNIMPLEMENTED();
8764 break;
8765 }
8766 }
8767}
8768
8769// Generate a reference result using scalar loads.
8770// For now this helper doesn't save and restore the caller registers.
8771// Clobber register z30, x28, x29 and p7.
8772template <size_t N>
8773static void ScalarLoadHelper(MacroAssembler* masm,
8774 int vl,
8775 const uint64_t (&addresses)[N],
8776 const ZRegister& zt_ref,
8777 const PRegisterZ& pg,
8778 unsigned esize_in_bits,
8779 unsigned msize_in_bits,
8780 bool is_signed) {
8781 unsigned esize_in_bytes = esize_in_bits / kBitsPerByte;
8782 ZRegister lane_numbers = z30.WithLaneSize(esize_in_bits);
8783 masm->Index(lane_numbers, 0, 1);
8784 masm->Dup(zt_ref, 0);
8785 for (unsigned i = 0; i < (vl / esize_in_bytes); i++) {
8786 masm->Mov(x29, addresses[N - i - 1]);
8787 Register rt(28, std::min(std::max(esize_in_bits, kSRegSize), kDRegSize));
8788 ScalarLoadHelper(masm, rt, x29, msize_in_bits, is_signed);
8789
8790 // Emulate predication.
8791 masm->Cmpeq(p7.WithLaneSize(esize_in_bits), pg, lane_numbers, i);
8792 masm->Cpy(zt_ref, p7.Merging(), rt);
8793 }
8794}
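// For example (illustrative only), a reference result for an unsigned ld1b
// gather into .D lanes could be generated with:
//
//   ScalarLoadHelper(&masm, vl, addresses, z1.VnD(), p0.Zeroing(),
//                    kDRegSize, kBRegSize, false);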
8795
TatWai Chong113d9192020-05-19 01:02:36 -07008796typedef void (MacroAssembler::*Ld1Macro)(const ZRegister& zt,
8797 const PRegisterZ& pg,
8798 const SVEMemOperand& addr);
8799
Martyn Capewella5112342020-06-05 18:20:11 +01008800template <typename T>
TatWai Chong6537a9a2020-05-05 14:15:16 -07008801static void Ldff1Helper(Test* config,
8802 uintptr_t data,
8803 unsigned msize_in_bits,
8804 unsigned esize_in_bits,
TatWai Chong1af34f12020-06-01 20:54:06 -07008805 CPURegister::RegisterType base_type,
TatWai Chong6537a9a2020-05-05 14:15:16 -07008806 Ld1Macro ldff1,
8807 Ld1Macro ld1,
Martyn Capewella5112342020-06-05 18:20:11 +01008808 T mod,
TatWai Chong6537a9a2020-05-05 14:15:16 -07008809 bool scale = false) {
8810 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
8811 START();
8812
8813 int vl = config->sve_vl_in_bytes();
8814 size_t page_size = sysconf(_SC_PAGE_SIZE);
8815 VIXL_ASSERT(page_size > static_cast<size_t>(vl));
8816
8817 unsigned esize_in_bytes = esize_in_bits / kBitsPerByte;
8818 unsigned msize_in_bytes = msize_in_bits / kBitsPerByte;
8819 unsigned msize_in_bytes_log2 = std::log2(msize_in_bytes);
8820 VIXL_ASSERT(msize_in_bits <= esize_in_bits);
8821
8822 PRegister all = p7;
8823 __ Ptrue(all.VnB());
8824
8825 size_t offset_modifier = 0;
8826
Martyn Capewell5f9b3802020-03-24 16:16:36 +00008827 // The highest address at which a load stopped. Every FF load should fault at
TatWai Chong6537a9a2020-05-05 14:15:16 -07008828 // `data + page_size`, so this value should not exceed that value. However,
8829 // the architecture allows fault-tolerant loads to fault arbitrarily, so the
8830 // real value may be lower.
8831 //
8832 // This is used to check that the `mprotect` above really does make the second
8833 // page inaccessible, and that the resulting FFR from each load reflects that.
8834 Register limit = x22;
8835 __ Mov(limit, 0);
8836
8837 // If the FFR grows unexpectedly, we increment this register by the
8838 // difference. FFR should never grow, except when explicitly set.
8839 Register ffr_grow_count = x23;
8840 __ Mov(ffr_grow_count, 0);
8841
8842 // Set the offset so that the load is guaranteed to start in the
8843 // accessible page, but end in the inaccessible one.
8844 VIXL_ASSERT((page_size % msize_in_bytes) == 0);
8845 VIXL_ASSERT((vl % msize_in_bytes) == 0);
8846 size_t elements_per_page = page_size / msize_in_bytes;
8847 size_t elements_per_access = vl / esize_in_bytes;
8848 size_t min_offset = (elements_per_page - elements_per_access) + 1;
8849 size_t max_offset = elements_per_page - 1;
8850 size_t offset =
8851 min_offset + (offset_modifier % (max_offset - min_offset + 1));
8852 offset_modifier++;
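  // For example, with a (hypothetical) 4kB page, a 32-byte VL and byte
  // elements (msize == esize == 8 bits): elements_per_page is 4096,
  // elements_per_access is 32, so `offset` lies in [4065, 4095] and every
  // access starts in the accessible page but extends at least to
  // `data + page_size`.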
8853
8854 __ Setffr();
8855 __ Mov(x20, data);
8856 __ Mov(x21, offset);
8857
TatWai Chong1af34f12020-06-01 20:54:06 -07008858 if (base_type == CPURegister::kRegister) {
8859 // Scalar-plus-scalar mode.
Martyn Capewella5112342020-06-05 18:20:11 +01008860 VIXL_ASSERT((std::is_same<T, vixl::aarch64::Shift>::value));
8861 VIXL_ASSERT((static_cast<int>(mod) == LSL) ||
8862 (static_cast<int>(mod) == NO_SHIFT));
8863 (masm.*ldff1)(z0.WithLaneSize(esize_in_bits),
8864 all.Zeroing(),
8865 SVEMemOperand(x20, x21, mod, msize_in_bytes_log2));
8866 } else {
8867 VIXL_ASSERT(base_type == CPURegister::kZRegister);
TatWai Chong1af34f12020-06-01 20:54:06 -07008868 int offs_size;
8869 bool offs_is_unsigned;
Martyn Capewella5112342020-06-05 18:20:11 +01008870 if (std::is_same<T, vixl::aarch64::Extend>::value) {
TatWai Chong1af34f12020-06-01 20:54:06 -07008871 // Scalar-plus-vector mode with 32-bit packed or unpacked, and unscaled or
8872 // scaled offsets.
Martyn Capewella5112342020-06-05 18:20:11 +01008873 VIXL_ASSERT((static_cast<int>(mod) == SXTW) ||
8874 (static_cast<int>(mod) == UXTW));
TatWai Chong1af34f12020-06-01 20:54:06 -07008875 if (scale == true) {
8876 // Gather first-fault byte loads don't support a scaled offset.
8877 VIXL_ASSERT(msize_in_bits != kBRegSize);
8878 }
Martyn Capewella5112342020-06-05 18:20:11 +01008879 offs_is_unsigned = (static_cast<int>(mod) == UXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07008880 offs_size = kSRegSize;
8881
8882 } else {
8883 // Scalar-plus-vector mode with 64-bit unscaled or scaled offset.
Martyn Capewella5112342020-06-05 18:20:11 +01008884 VIXL_ASSERT((std::is_same<T, vixl::aarch64::Shift>::value));
8885 VIXL_ASSERT((static_cast<int>(mod) == LSL) ||
8886 (static_cast<int>(mod) == NO_SHIFT));
TatWai Chong1af34f12020-06-01 20:54:06 -07008887 offs_is_unsigned = false;
8888 offs_size = kDRegSize;
8889 }
8890
TatWai Chong6537a9a2020-05-05 14:15:16 -07008891 // Generate the access pattern "base address + (index << shift)".
8892 // For unscaled-offset operations, use `msize_in_bytes` as the step between
8893 // consecutive lane indexes; otherwise, step the indexes by 1 so that, once
8894 // scaled by the shift value, consecutive lanes still advance by `msize_in_bytes`.
8895 int shift = (scale == true) ? msize_in_bytes_log2 : 0;
8896 int index_offset = msize_in_bytes >> shift;
8897 VIXL_ASSERT(index_offset > 0);
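      // For example, for a (hypothetical) Ldff1h gather: with a scaled offset,
      // shift is 1 and index_offset is 1; with an unscaled offset, shift is 0
      // and index_offset is 2. Both forms step through memory two bytes at a
      // time.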
TatWai Chong6537a9a2020-05-05 14:15:16 -07008898 uint64_t index = 0;
8899 uint64_t base_address = 0;
8900
TatWai Chong1af34f12020-06-01 20:54:06 -07008901 if (offs_is_unsigned == true) {
TatWai Chong6537a9a2020-05-05 14:15:16 -07008902 // Base address.
8903 base_address = data;
8904 // Maximum unsigned positive index.
8905 index = page_size >> shift;
8906
8907 } else {
8908 // Base address.
8909 base_address = data + (2 * page_size);
8910 // An index that is negative when interpreted as a signed value; combined
// with the base address above, it addresses the start of the protected page.
8911 uint64_t uint_e_max =
8912 (esize_in_bits == kDRegSize) ? UINT64_MAX : UINT32_MAX;
8913 index = uint_e_max - (page_size >> shift) + 1;
8914 }
8915
8916 __ Mov(x19, base_address);
8917 if ((offs_size == kSRegSize) && (esize_in_bits == kDRegSize)) {
8918 // In this case, the index values are sign- or zero-extended from 32 to
8919 // 64 bits. Assign an arbitrary value to the top 32 bits to check that
8920 // only the low 32 bits are used as the index.
8921 index |= 0x1234567800000000;
8922 }
8923
8924 index -= index_offset * (elements_per_access - 1);
8925 __ Index(z17.WithLaneSize(esize_in_bits), index, index_offset);
8926
8927 // Scalar plus vector mode.
8928 (masm.*
8929 ldff1)(z0.WithLaneSize(esize_in_bits),
8930 all.Zeroing(),
8931 SVEMemOperand(x19, z17.WithLaneSize(esize_in_bits), mod, shift));
8932 }
8933
8934 __ Rdffrs(p0.VnB(), all.Zeroing());
8935
8936 // Execute another Ldff1 with no offset, so that every element could be
8937 // read. It should respect FFR, and load no more than we loaded the
8938 // first time.
8939 (masm.*
8940 ldff1)(z16.WithLaneSize(esize_in_bits), all.Zeroing(), SVEMemOperand(x20));
8941 __ Rdffrs(p1.VnB(), all.Zeroing());
8942 __ Cntp(x0, all, p1.VnB());
8943 __ Uqdecp(x0, p0.VnB());
8944 __ Add(ffr_grow_count, ffr_grow_count, x0);
8945
8946 // Use the FFR to predicate the normal load. If it wasn't properly set,
8947 // the normal load will abort.
8948 (masm.*ld1)(z16.WithLaneSize(esize_in_bits),
8949 p0.Zeroing(),
8950 SVEMemOperand(x20, x21, LSL, msize_in_bytes_log2));
8951
8952 // Work out the address after the one that was just accessed.
8953 __ Incp(x21, p0.WithLaneSize(esize_in_bits));
8954 __ Add(x0, x20, Operand(x21, LSL, msize_in_bytes_log2));
8955 __ Cmp(limit, x0);
8956 __ Csel(limit, limit, x0, hs);
8957
8958 // Clear lanes inactive in FFR. These have an undefined result.
Martyn Capewella24d95c2020-05-20 11:11:15 +01008959 __ Not(p0.VnB(), all.Zeroing(), p0.VnB());
Martyn Capewelle2de6072020-05-22 09:52:06 +01008960 __ Mov(z0.WithLaneSize(esize_in_bits), p0.Merging(), 0);
TatWai Chong6537a9a2020-05-05 14:15:16 -07008961
8962 END();
8963
8964 if (CAN_RUN()) {
8965 RUN();
8966
8967 uintptr_t expected_limit = data + page_size;
8968 uintptr_t measured_limit = core.xreg(limit.GetCode());
8969 VIXL_CHECK(measured_limit <= expected_limit);
8970 if (measured_limit < expected_limit) {
8971 // We can't fail the test for this case, but a warning is helpful for
8972 // manually-run tests.
8973 printf(
8974 "WARNING: All fault-tolerant loads detected faults before the\n"
8975 "expected limit. This is architecturally possible, but improbable,\n"
8976 "and could be a symptom of another problem.\n");
8977 }
8978
8979 ASSERT_EQUAL_64(0, ffr_grow_count);
8980
8981 ASSERT_EQUAL_SVE(z0.WithLaneSize(esize_in_bits),
8982 z16.WithLaneSize(esize_in_bits));
8983 }
8984}
8985
8986TEST_SVE(sve_ldff1_scalar_plus_scalar) {
8987 size_t page_size = sysconf(_SC_PAGE_SIZE);
8988 VIXL_ASSERT(page_size > static_cast<size_t>(config->sve_vl_in_bytes()));
8989
8990 // Allocate two pages, then mprotect the second one to make it inaccessible.
8991 uintptr_t data = reinterpret_cast<uintptr_t>(mmap(NULL,
8992 page_size * 2,
8993 PROT_READ | PROT_WRITE,
8994 MAP_PRIVATE | MAP_ANONYMOUS,
8995 -1,
8996 0));
8997 mprotect(reinterpret_cast<void*>(data + page_size), page_size, PROT_NONE);
8998
8999 // Fill the accessible page with arbitrary data.
9000 for (size_t i = 0; i < page_size; i++) {
9001 // Reverse bits so we get a mixture of positive and negative values.
9002 uint8_t byte = ReverseBits(static_cast<uint8_t>(i));
9003 memcpy(reinterpret_cast<void*>(data + i), &byte, 1);
9004 }
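  // For example, offsets 0, 1, 2 and 3 hold 0x00, 0x80, 0x40 and 0xc0
  // respectively, so the page contains bytes with the sign bit both clear and
  // set.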
9005
Martyn Capewella5112342020-06-05 18:20:11 +01009006 auto ldff1_unscaled_offset_helper = std::bind(&Ldff1Helper<Shift>,
TatWai Chong1af34f12020-06-01 20:54:06 -07009007 config,
9008 data,
9009 std::placeholders::_1,
9010 std::placeholders::_2,
9011 CPURegister::kRegister,
9012 std::placeholders::_3,
9013 std::placeholders::_4,
Martyn Capewella5112342020-06-05 18:20:11 +01009014 NO_SHIFT,
TatWai Chong1af34f12020-06-01 20:54:06 -07009015 false);
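  // The bound helper is called as (msize_in_bits, esize_in_bits, ldff1, ld1),
  // matching placeholders _1 to _4 above.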
9016
TatWai Chong6537a9a2020-05-05 14:15:16 -07009017 Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
9018 Ld1Macro ld1b = &MacroAssembler::Ld1b;
TatWai Chong1af34f12020-06-01 20:54:06 -07009019 ldff1_unscaled_offset_helper(kBRegSize, kBRegSize, ldff1b, ld1b);
9020 ldff1_unscaled_offset_helper(kBRegSize, kHRegSize, ldff1b, ld1b);
9021 ldff1_unscaled_offset_helper(kBRegSize, kSRegSize, ldff1b, ld1b);
9022 ldff1_unscaled_offset_helper(kBRegSize, kDRegSize, ldff1b, ld1b);
TatWai Chong6537a9a2020-05-05 14:15:16 -07009023
9024 Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
9025 Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
TatWai Chong1af34f12020-06-01 20:54:06 -07009026 ldff1_unscaled_offset_helper(kBRegSize, kHRegSize, ldff1sb, ld1sb);
9027 ldff1_unscaled_offset_helper(kBRegSize, kSRegSize, ldff1sb, ld1sb);
9028 ldff1_unscaled_offset_helper(kBRegSize, kDRegSize, ldff1sb, ld1sb);
9029
Martyn Capewella5112342020-06-05 18:20:11 +01009030 auto ldff1_scaled_offset_helper = std::bind(&Ldff1Helper<Shift>,
TatWai Chong1af34f12020-06-01 20:54:06 -07009031 config,
9032 data,
9033 std::placeholders::_1,
9034 std::placeholders::_2,
9035 CPURegister::kRegister,
9036 std::placeholders::_3,
9037 std::placeholders::_4,
Martyn Capewella5112342020-06-05 18:20:11 +01009038 LSL,
TatWai Chong1af34f12020-06-01 20:54:06 -07009039 true);
9040
9041 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
9042 Ld1Macro ld1h = &MacroAssembler::Ld1h;
9043 ldff1_scaled_offset_helper(kHRegSize, kHRegSize, ldff1h, ld1h);
9044 ldff1_scaled_offset_helper(kHRegSize, kSRegSize, ldff1h, ld1h);
9045 ldff1_scaled_offset_helper(kHRegSize, kDRegSize, ldff1h, ld1h);
9046
9047 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
9048 Ld1Macro ld1w = &MacroAssembler::Ld1w;
9049 ldff1_scaled_offset_helper(kSRegSize, kSRegSize, ldff1w, ld1w);
9050 ldff1_scaled_offset_helper(kSRegSize, kDRegSize, ldff1w, ld1w);
9051
9052 Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
9053 Ld1Macro ld1d = &MacroAssembler::Ld1d;
9054 ldff1_scaled_offset_helper(kDRegSize, kDRegSize, ldff1d, ld1d);
TatWai Chong6537a9a2020-05-05 14:15:16 -07009055
9056 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
9057 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
TatWai Chong1af34f12020-06-01 20:54:06 -07009058 ldff1_scaled_offset_helper(kHRegSize, kSRegSize, ldff1sh, ld1sh);
9059 ldff1_scaled_offset_helper(kHRegSize, kDRegSize, ldff1sh, ld1sh);
TatWai Chong6537a9a2020-05-05 14:15:16 -07009060
9061 Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
9062 Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
TatWai Chong1af34f12020-06-01 20:54:06 -07009063 ldff1_scaled_offset_helper(kSRegSize, kDRegSize, ldff1sw, ld1sw);
TatWai Chong6537a9a2020-05-05 14:15:16 -07009064
9065 munmap(reinterpret_cast<void*>(data), page_size * 2);
9066}
9067
TatWai Chong1af34f12020-06-01 20:54:06 -07009068static void sve_ldff1_scalar_plus_vector_32_scaled_offset(Test* config,
9069 uintptr_t data) {
Martyn Capewella5112342020-06-05 18:20:11 +01009070 auto ldff1_32_scaled_offset_helper = std::bind(&Ldff1Helper<Extend>,
TatWai Chong1af34f12020-06-01 20:54:06 -07009071 config,
9072 data,
9073 std::placeholders::_1,
9074 kSRegSize,
9075 CPURegister::kZRegister,
9076 std::placeholders::_2,
9077 std::placeholders::_3,
9078 std::placeholders::_4,
9079 true);
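  // Here the bound helper is called as (msize_in_bits, ldff1, ld1, extend),
  // with the extend modifier (UXTW or SXTW) supplied last.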
9080 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
9081 Ld1Macro ld1h = &MacroAssembler::Ld1h;
Martyn Capewella5112342020-06-05 18:20:11 +01009082 ldff1_32_scaled_offset_helper(kHRegSize, ldff1h, ld1h, UXTW);
9083 ldff1_32_scaled_offset_helper(kHRegSize, ldff1h, ld1h, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009084
9085 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
9086 Ld1Macro ld1w = &MacroAssembler::Ld1w;
Martyn Capewella5112342020-06-05 18:20:11 +01009087 ldff1_32_scaled_offset_helper(kSRegSize, ldff1w, ld1w, UXTW);
9088 ldff1_32_scaled_offset_helper(kSRegSize, ldff1w, ld1w, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009089
9090 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
9091 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
Martyn Capewella5112342020-06-05 18:20:11 +01009092 ldff1_32_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh, UXTW);
9093 ldff1_32_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009094}
9095
9096static void sve_ldff1_scalar_plus_vector_32_unscaled_offset(Test* config,
9097 uintptr_t data) {
Martyn Capewella5112342020-06-05 18:20:11 +01009098 auto ldff1_32_unscaled_offset_helper = std::bind(&Ldff1Helper<Extend>,
TatWai Chong1af34f12020-06-01 20:54:06 -07009099 config,
9100 data,
9101 std::placeholders::_1,
9102 kSRegSize,
9103 CPURegister::kZRegister,
9104 std::placeholders::_2,
9105 std::placeholders::_3,
9106 std::placeholders::_4,
9107 false);
9108
9109 Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
9110 Ld1Macro ld1b = &MacroAssembler::Ld1b;
Martyn Capewella5112342020-06-05 18:20:11 +01009111 ldff1_32_unscaled_offset_helper(kBRegSize, ldff1b, ld1b, UXTW);
9112 ldff1_32_unscaled_offset_helper(kBRegSize, ldff1b, ld1b, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009113
9114 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
9115 Ld1Macro ld1h = &MacroAssembler::Ld1h;
Martyn Capewella5112342020-06-05 18:20:11 +01009116 ldff1_32_unscaled_offset_helper(kHRegSize, ldff1h, ld1h, UXTW);
9117 ldff1_32_unscaled_offset_helper(kHRegSize, ldff1h, ld1h, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009118
9119 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
9120 Ld1Macro ld1w = &MacroAssembler::Ld1w;
Martyn Capewella5112342020-06-05 18:20:11 +01009121 ldff1_32_unscaled_offset_helper(kSRegSize, ldff1w, ld1w, UXTW);
9122 ldff1_32_unscaled_offset_helper(kSRegSize, ldff1w, ld1w, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009123
9124 Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
9125 Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
Martyn Capewella5112342020-06-05 18:20:11 +01009126 ldff1_32_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb, UXTW);
9127 ldff1_32_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009128
9129 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
9130 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
Martyn Capewella5112342020-06-05 18:20:11 +01009131 ldff1_32_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh, UXTW);
9132 ldff1_32_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009133}
9134
9135static void sve_ldff1_scalar_plus_vector_32_unpacked_scaled_offset(
9136 Test* config, uintptr_t data) {
9137 auto ldff1_32_unpacked_scaled_offset_helper =
Martyn Capewella5112342020-06-05 18:20:11 +01009138 std::bind(&Ldff1Helper<Extend>,
TatWai Chong1af34f12020-06-01 20:54:06 -07009139 config,
9140 data,
9141 std::placeholders::_1,
9142 kDRegSize,
9143 CPURegister::kZRegister,
9144 std::placeholders::_2,
9145 std::placeholders::_3,
9146 std::placeholders::_4,
9147 true);
9148
9149 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
9150 Ld1Macro ld1h = &MacroAssembler::Ld1h;
Martyn Capewella5112342020-06-05 18:20:11 +01009151 ldff1_32_unpacked_scaled_offset_helper(kHRegSize, ldff1h, ld1h, UXTW);
9152 ldff1_32_unpacked_scaled_offset_helper(kHRegSize, ldff1h, ld1h, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009153
9154 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
9155 Ld1Macro ld1w = &MacroAssembler::Ld1w;
Martyn Capewella5112342020-06-05 18:20:11 +01009156 ldff1_32_unpacked_scaled_offset_helper(kSRegSize, ldff1w, ld1w, UXTW);
9157 ldff1_32_unpacked_scaled_offset_helper(kSRegSize, ldff1w, ld1w, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009158
9159 Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
9160 Ld1Macro ld1d = &MacroAssembler::Ld1d;
Martyn Capewella5112342020-06-05 18:20:11 +01009161 ldff1_32_unpacked_scaled_offset_helper(kDRegSize, ldff1d, ld1d, UXTW);
9162 ldff1_32_unpacked_scaled_offset_helper(kDRegSize, ldff1d, ld1d, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009163
9164 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
9165 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
Martyn Capewella5112342020-06-05 18:20:11 +01009166 ldff1_32_unpacked_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh, UXTW);
9167 ldff1_32_unpacked_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009168
9169 Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
9170 Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
Martyn Capewella5112342020-06-05 18:20:11 +01009171 ldff1_32_unpacked_scaled_offset_helper(kSRegSize, ldff1sw, ld1sw, UXTW);
9172 ldff1_32_unpacked_scaled_offset_helper(kSRegSize, ldff1sw, ld1sw, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009173}
9174
9175static void sve_ldff1_scalar_plus_vector_32_unpacked_unscaled_offset(
9176 Test* config, uintptr_t data) {
9177 auto ldff1_32_unpacked_unscaled_offset_helper =
Martyn Capewella5112342020-06-05 18:20:11 +01009178 std::bind(&Ldff1Helper<Extend>,
TatWai Chong1af34f12020-06-01 20:54:06 -07009179 config,
9180 data,
9181 std::placeholders::_1,
9182 kDRegSize,
9183 CPURegister::kZRegister,
9184 std::placeholders::_2,
9185 std::placeholders::_3,
9186 std::placeholders::_4,
9187 false);
9188
9189 Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
9190 Ld1Macro ld1b = &MacroAssembler::Ld1b;
Martyn Capewella5112342020-06-05 18:20:11 +01009191 ldff1_32_unpacked_unscaled_offset_helper(kBRegSize, ldff1b, ld1b, UXTW);
9192 ldff1_32_unpacked_unscaled_offset_helper(kBRegSize, ldff1b, ld1b, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009193
9194 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
9195 Ld1Macro ld1h = &MacroAssembler::Ld1h;
Martyn Capewella5112342020-06-05 18:20:11 +01009196 ldff1_32_unpacked_unscaled_offset_helper(kHRegSize, ldff1h, ld1h, UXTW);
9197 ldff1_32_unpacked_unscaled_offset_helper(kHRegSize, ldff1h, ld1h, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009198
9199 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
9200 Ld1Macro ld1w = &MacroAssembler::Ld1w;
Martyn Capewella5112342020-06-05 18:20:11 +01009201 ldff1_32_unpacked_unscaled_offset_helper(kSRegSize, ldff1w, ld1w, UXTW);
9202 ldff1_32_unpacked_unscaled_offset_helper(kSRegSize, ldff1w, ld1w, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009203
9204 Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
9205 Ld1Macro ld1d = &MacroAssembler::Ld1d;
Martyn Capewella5112342020-06-05 18:20:11 +01009206 ldff1_32_unpacked_unscaled_offset_helper(kDRegSize, ldff1d, ld1d, UXTW);
9207 ldff1_32_unpacked_unscaled_offset_helper(kDRegSize, ldff1d, ld1d, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009208
9209 Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
9210 Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
Martyn Capewella5112342020-06-05 18:20:11 +01009211 ldff1_32_unpacked_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb, UXTW);
9212 ldff1_32_unpacked_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009213
9214 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
9215 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
Martyn Capewella5112342020-06-05 18:20:11 +01009216 ldff1_32_unpacked_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh, UXTW);
9217 ldff1_32_unpacked_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009218
9219 Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
9220 Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
Martyn Capewella5112342020-06-05 18:20:11 +01009221 ldff1_32_unpacked_unscaled_offset_helper(kSRegSize, ldff1sw, ld1sw, UXTW);
9222 ldff1_32_unpacked_unscaled_offset_helper(kSRegSize, ldff1sw, ld1sw, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009223}
9224
9225static void sve_ldff1_scalar_plus_vector_64_scaled_offset(Test* config,
9226 uintptr_t data) {
Martyn Capewella5112342020-06-05 18:20:11 +01009227 auto ldff1_64_scaled_offset_helper = std::bind(&Ldff1Helper<Shift>,
TatWai Chong1af34f12020-06-01 20:54:06 -07009228 config,
9229 data,
9230 std::placeholders::_1,
9231 kDRegSize,
9232 CPURegister::kZRegister,
9233 std::placeholders::_2,
9234 std::placeholders::_3,
Martyn Capewella5112342020-06-05 18:20:11 +01009235 LSL,
TatWai Chong1af34f12020-06-01 20:54:06 -07009236 true);
9237
9238 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
9239 Ld1Macro ld1h = &MacroAssembler::Ld1h;
9240 ldff1_64_scaled_offset_helper(kHRegSize, ldff1h, ld1h);
9241
9242 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
9243 Ld1Macro ld1w = &MacroAssembler::Ld1w;
9244 ldff1_64_scaled_offset_helper(kSRegSize, ldff1w, ld1w);
9245
9246 Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
9247 Ld1Macro ld1d = &MacroAssembler::Ld1d;
9248 ldff1_64_scaled_offset_helper(kDRegSize, ldff1d, ld1d);
9249
9250 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
9251 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
9252 ldff1_64_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh);
9253
9254 Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
9255 Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
9256 ldff1_64_scaled_offset_helper(kSRegSize, ldff1sw, ld1sw);
9257}
9258
9259static void sve_ldff1_scalar_plus_vector_64_unscaled_offset(Test* config,
9260 uintptr_t data) {
Martyn Capewella5112342020-06-05 18:20:11 +01009261 auto ldff1_64_unscaled_offset_helper = std::bind(&Ldff1Helper<Shift>,
TatWai Chong1af34f12020-06-01 20:54:06 -07009262 config,
9263 data,
9264 std::placeholders::_1,
9265 kDRegSize,
9266 CPURegister::kZRegister,
9267 std::placeholders::_2,
9268 std::placeholders::_3,
Martyn Capewella5112342020-06-05 18:20:11 +01009269 NO_SHIFT,
TatWai Chong1af34f12020-06-01 20:54:06 -07009270 false);
9271
9272 Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
9273 Ld1Macro ld1b = &MacroAssembler::Ld1b;
9274 ldff1_64_unscaled_offset_helper(kBRegSize, ldff1b, ld1b);
9275
9276 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
9277 Ld1Macro ld1h = &MacroAssembler::Ld1h;
9278 ldff1_64_unscaled_offset_helper(kHRegSize, ldff1h, ld1h);
9279
9280 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
9281 Ld1Macro ld1w = &MacroAssembler::Ld1w;
9282 ldff1_64_unscaled_offset_helper(kSRegSize, ldff1w, ld1w);
9283
9284 Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
9285 Ld1Macro ld1d = &MacroAssembler::Ld1d;
9286 ldff1_64_unscaled_offset_helper(kDRegSize, ldff1d, ld1d);
9287
9288 Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
9289 Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
9290 ldff1_64_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb);
9291
9292 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
9293 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
9294 ldff1_64_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh);
9295
9296 Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
9297 Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
9298 ldff1_64_unscaled_offset_helper(kSRegSize, ldff1sw, ld1sw);
9299}
9300
TatWai Chong6537a9a2020-05-05 14:15:16 -07009301TEST_SVE(sve_ldff1_scalar_plus_vector) {
9302 size_t page_size = sysconf(_SC_PAGE_SIZE);
9303 VIXL_ASSERT(page_size > static_cast<size_t>(config->sve_vl_in_bytes()));
9304
9305 // Allocate two pages, then mprotect the second one to make it inaccessible.
9306 uintptr_t data = reinterpret_cast<uintptr_t>(mmap(NULL,
9307 page_size * 2,
9308 PROT_READ | PROT_WRITE,
9309 MAP_PRIVATE | MAP_ANONYMOUS,
9310 -1,
9311 0));
9312 mprotect(reinterpret_cast<void*>(data + page_size), page_size, PROT_NONE);
9313
9314 // Fill the accessible page with arbitrary data.
9315 for (size_t i = 0; i < page_size; i++) {
9316 // Reverse bits so we get a mixture of positive and negative values.
9317 uint8_t byte = ReverseBits(static_cast<uint8_t>(i));
9318 memcpy(reinterpret_cast<void*>(data + i), &byte, 1);
9319 }
9320
TatWai Chong1af34f12020-06-01 20:54:06 -07009321 sve_ldff1_scalar_plus_vector_32_scaled_offset(config, data);
9322 sve_ldff1_scalar_plus_vector_32_unscaled_offset(config, data);
9323 sve_ldff1_scalar_plus_vector_32_unpacked_scaled_offset(config, data);
9324 sve_ldff1_scalar_plus_vector_32_unpacked_unscaled_offset(config, data);
9325 sve_ldff1_scalar_plus_vector_64_scaled_offset(config, data);
9326 sve_ldff1_scalar_plus_vector_64_unscaled_offset(config, data);
TatWai Chong6537a9a2020-05-05 14:15:16 -07009327
9328 munmap(reinterpret_cast<void*>(data), page_size * 2);
9329}
9330
Martyn Capewell5f9b3802020-03-24 16:16:36 +00009331TEST_SVE(sve_ldnf1) {
9332 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
9333 CPUFeatures::kNEON,
9334 CPUFeatures::kFP);
9335 START();
9336
9337 size_t page_size = sysconf(_SC_PAGE_SIZE);
9338 VIXL_ASSERT(page_size > static_cast<size_t>(config->sve_vl_in_bytes()));
9339
9340 // Allocate two pages, fill them with data, then mprotect the second one to
9341 // make it inaccessible.
9342 uintptr_t data = reinterpret_cast<uintptr_t>(mmap(NULL,
9343 page_size * 2,
9344 PROT_READ | PROT_WRITE,
9345 MAP_PRIVATE | MAP_ANONYMOUS,
9346 -1,
9347 0));
9348
9349 // Fill the pages with arbitrary data.
9350 for (size_t i = 0; i < page_size; i++) {
9351 // Reverse bits so we get a mixture of positive and negative values.
9352 uint8_t byte = ReverseBits(static_cast<uint8_t>(i));
9353 memcpy(reinterpret_cast<void*>(data + i), &byte, 1);
9354 }
9355
9356 mprotect(reinterpret_cast<void*>(data + page_size), page_size, PROT_NONE);
9357
9358 __ Setffr();
9359 __ Ptrue(p0.VnB());
9360 __ Dup(z10.VnB(), 0);
9361
9362 // Point x0 at the last eight unprotected bytes.
9363 __ Mov(x0, data + page_size - (kQRegSizeInBytes / kBRegSizeInBytes) / 2);
9364
9365 // Load, non-faulting, a vector of bytes from x0. At most, eight bytes will be
9366 // loaded, the rest being in a protected page.
9367 __ Ldnf1b(z0.VnB(), p0.Zeroing(), SVEMemOperand(x0));
9368 __ Rdffr(p1.VnB());
9369 __ Setffr();
9370
9371 // Create references using the FFR value in p1 to zero the undefined lanes.
9372 __ Sel(z0.VnB(), p1, z0.VnB(), z10.VnB());
9373 __ Ld1b(z20.VnB(), p1.Zeroing(), SVEMemOperand(x0));
9374
9375 // Repeat for larger elements and different addresses, giving different FFR
9376 // results.
9377 __ Add(x1, x0, 1);
9378 __ Ldnf1h(z1.VnH(), p0.Zeroing(), SVEMemOperand(x1));
9379 __ Rdffr(p1.VnB());
9380 __ Setffr();
9381 __ Sel(z1.VnH(), p1, z1.VnH(), z10.VnH());
9382 __ Ld1h(z21.VnH(), p1.Zeroing(), SVEMemOperand(x1));
9383
9384 __ Add(x1, x0, 2);
9385 __ Ldnf1w(z2.VnS(), p0.Zeroing(), SVEMemOperand(x1));
9386 __ Rdffr(p1.VnB());
9387 __ Setffr();
9388 __ Sel(z2.VnS(), p1, z2.VnS(), z10.VnS());
9389 __ Ld1w(z22.VnS(), p1.Zeroing(), SVEMemOperand(x1));
9390
9391 __ Sub(x1, x0, 1);
9392 __ Ldnf1d(z3.VnD(), p0.Zeroing(), SVEMemOperand(x1));
9393 __ Rdffr(p1.VnB());
9394 __ Setffr();
9395 __ Sel(z3.VnD(), p1, z3.VnD(), z10.VnD());
9396 __ Ld1d(z23.VnD(), p1.Zeroing(), SVEMemOperand(x1));
9397
9398 // Load from previous VL-sized area of memory. All of this should be in the
9399 // accessible page.
9400 __ Ldnf1b(z4.VnB(), p0.Zeroing(), SVEMemOperand(x0, -1, SVE_MUL_VL));
9401 __ Rdffr(p1.VnB());
9402 __ Setffr();
9403 __ Sel(z4.VnB(), p1, z4.VnB(), z10.VnB());
9404 __ Ld1b(z24.VnB(), p1.Zeroing(), SVEMemOperand(x0, -1, SVE_MUL_VL));
9405
9406 // Repeat partial load for larger element size.
9407 __ Mov(x0, data + page_size - (kQRegSizeInBytes / kSRegSizeInBytes) / 2);
9408 __ Ldnf1b(z5.VnS(), p0.Zeroing(), SVEMemOperand(x0));
9409 __ Rdffr(p1.VnB());
9410 __ Setffr();
9411 __ Sel(z5.VnS(), p1, z5.VnS(), z10.VnS());
9412 __ Ld1b(z25.VnS(), p1.Zeroing(), SVEMemOperand(x0));
9413
9414 // Repeat for sign extension.
9415 __ Mov(x0, data + page_size - (kQRegSizeInBytes / kHRegSizeInBytes) / 2);
9416 __ Ldnf1sb(z6.VnH(), p0.Zeroing(), SVEMemOperand(x0));
9417 __ Rdffr(p1.VnB());
9418 __ Setffr();
9419 __ Sel(z6.VnH(), p1, z6.VnH(), z10.VnH());
9420 __ Ld1sb(z26.VnH(), p1.Zeroing(), SVEMemOperand(x0));
9421
9422 END();
9423
9424 if (CAN_RUN()) {
9425 RUN();
9426 ASSERT_EQUAL_SVE(z20, z0);
9427 ASSERT_EQUAL_SVE(z21, z1);
9428 ASSERT_EQUAL_SVE(z22, z2);
9429 ASSERT_EQUAL_SVE(z23, z3);
9430 ASSERT_EQUAL_SVE(z24, z4);
9431 ASSERT_EQUAL_SVE(z25, z5);
9432 ASSERT_EQUAL_SVE(z26, z6);
9433 }
9434
9435 munmap(reinterpret_cast<void*>(data), page_size * 2);
9436}
9437
TatWai Chongcd3f6c52020-06-14 00:42:39 -07009438// This test emphasises checking that the addressing modifiers are propagated
// and simulated correctly.
9439TEST_SVE(sve_ldff1_regression_test) {
9440 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
9441 START();
9442
9443 size_t page_size = sysconf(_SC_PAGE_SIZE);
9444 VIXL_ASSERT(page_size > static_cast<size_t>(config->sve_vl_in_bytes()));
9445
9446 uintptr_t data = reinterpret_cast<uintptr_t>(mmap(NULL,
9447 page_size * 2,
9448 PROT_READ | PROT_WRITE,
9449 MAP_PRIVATE | MAP_ANONYMOUS,
9450 -1,
9451 0));
9452 uintptr_t middle = data + page_size;
9453 // Fill the accessible page with arbitrary data.
9454 for (size_t i = 0; i < page_size; i++) {
9455 // Reverse bits so we get a mixture of positive and negative values.
9456 uint8_t byte = ReverseBits(static_cast<uint8_t>(i));
9457 memcpy(reinterpret_cast<void*>(middle + i), &byte, 1);
9458 // Make the byte differ by roughly one bit from the byte stored at the
9459 // mirrored positive offset, and copy the bytes in the reverse direction so
9460 // that loads using negative indexes are convenient to verify.
9461 byte += 1;
9462 memcpy(reinterpret_cast<void*>(middle - i), &byte, 1);
9463 }
9464
9465 PRegister all = p6;
9466 __ Ptrue(all.VnB());
9467
9468 __ Mov(x0, middle);
9469 __ Index(z31.VnS(), 0, 3);
9470 __ Neg(z30.VnS(), z31.VnS());
9471
9472 __ Setffr();
9473
9474 // Scalar plus vector 32 unscaled offset
9475 __ Ldff1b(z1.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
9476 __ Ldff1h(z2.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW));
9477 __ Ldff1w(z3.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
9478 __ Ldff1sb(z4.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW));
9479 __ Ldff1sh(z5.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
9480
9481 // Scalar plus vector 32 scaled offset
9482 __ Ldff1h(z6.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW, 1));
9483 __ Ldff1w(z7.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW, 2));
9484 __ Ldff1sh(z8.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW, 1));
9485
9486 __ Index(z31.VnD(), 0, 3);
9487 __ Neg(z30.VnD(), z31.VnD());
9488
9489 // Ensure only the low 32 bits are used when testing with positive index
9490 // values. This also tests that the indexes are treated as positive in
// `uxtw` form.
9491 __ Mov(x3, 0x8000000080000000);
9492 __ Dup(z28.VnD(), x3);
9493 __ Sub(x2, x0, 0x80000000);
9494 __ Add(z29.VnD(), z31.VnD(), z28.VnD());
9495
9496 // Scalar plus vector 32 unpacked unscaled offset
9497 __ Ldff1b(z9.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9498 __ Ldff1h(z10.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW));
9499 __ Ldff1w(z11.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9500 __ Ldff1sb(z12.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9501 __ Ldff1sh(z13.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW));
9502 __ Ldff1sw(z14.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9503
9504 // Scalar plus vector 32 unpacked scaled offset
9505 __ Ldff1h(z15.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 1));
9506 __ Ldff1w(z16.VnD(), all.Zeroing(), SVEMemOperand(x0, z31.VnD(), UXTW, 2));
9507 __ Ldff1d(z17.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 3));
9508 __ Ldff1sh(z18.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 1));
9509 __ Ldff1sw(z19.VnD(), all.Zeroing(), SVEMemOperand(x0, z31.VnD(), UXTW, 2));
9510
9511 __ Sub(x0, x0, x3);
9512 // Note that `0x8000000080000000` has been added to the positive indexes, so
9513 // the wrong address will be accessed if the offset is treated as negative.
9514
9515 // Scalar plus vector 64 unscaled offset
9516 __ Ldff1b(z20.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9517 __ Ldff1h(z21.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9518 __ Ldff1w(z22.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9519 __ Ldff1sh(z23.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9520 __ Ldff1sw(z24.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9521
9522 // Scalar plus vector 64 scaled offset
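  // The bias added to the indexes is pre-shifted right by the scale amount so
  // that, after the LSL in the addressing mode, it becomes 0x8000000080000000
  // again and cancels the adjustment made to x0 above.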
9523 __ Lsr(z29.VnD(), z28.VnD(), 1); // Shift right to 0x4000000040000000
9524 __ Add(z30.VnD(), z31.VnD(), z29.VnD());
9525 __ Ldff1h(z25.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 1));
9526 __ Ldff1sh(z26.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 1));
9527
9528 __ Lsr(z29.VnD(), z29.VnD(), 1); // Shift right to 0x2000000020000000
9529 __ Add(z30.VnD(), z31.VnD(), z29.VnD());
9530 __ Ldff1w(z27.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 2));
9531 __ Ldff1sw(z28.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 2));
9532
9533 __ Lsr(z29.VnD(), z29.VnD(), 1); // Shift right to 0x1000000010000000
9534 __ Add(z30.VnD(), z31.VnD(), z29.VnD());
9535 __ Ldff1d(z29.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 3));
9536
9537 __ Rdffr(p1.VnB());
9538 __ Cntp(x10, all, p1.VnB());
9539
9540 END();
9541
9542 if (CAN_RUN()) {
9543 RUN();
9544
9545 int64_t loaded_data_in_bytes = core.xreg(x10.GetCode());
9546 // Only check 128 bits in this test.
9547 if (loaded_data_in_bytes < kQRegSizeInBytes) {
9548 // Report a warning when the fault-tolerant loads hit a fault before all of
9549 // the expected loads completed.
9550 printf(
9551 "WARNING: Fault-tolerant loads detected faults before the "
9552 "expected loads completed.\n");
9553 return;
9554 }
9555
9556 // Scalar plus vector 32 unscaled offset
9557 uint32_t expected_z1[] = {0x00000090, 0x00000060, 0x000000c0, 0x00000001};
9558 uint32_t expected_z2[] = {0x00001191, 0x0000a161, 0x000041c1, 0x00008001};
9559 uint32_t expected_z3[] = {0x30d05090, 0x9010e060, 0x60a020c0, 0xc0408001};
9560 uint32_t expected_z4[] = {0xffffff91, 0x00000061, 0xffffffc1, 0x00000001};
9561 uint32_t expected_z5[] = {0x00005090, 0xffffe060, 0x000020c0, 0xffff8001};
9562
9563 ASSERT_EQUAL_SVE(expected_z1, z1.VnS());
9564 ASSERT_EQUAL_SVE(expected_z2, z2.VnS());
9565 ASSERT_EQUAL_SVE(expected_z3, z3.VnS());
9566 ASSERT_EQUAL_SVE(expected_z4, z4.VnS());
9567 ASSERT_EQUAL_SVE(expected_z5, z5.VnS());
9568
9569 // Scalar plus vector 32 scaled offset
9570 uint32_t expected_z6[] = {0x0000c848, 0x0000b030, 0x0000e060, 0x00008001};
9571 uint32_t expected_z7[] = {0xe464a424, 0xd8589818, 0xf070b030, 0xc0408001};
9572 uint32_t expected_z8[] = {0xffff8949, 0xffffd131, 0xffffa161, 0xffff8001};
9573
9574 ASSERT_EQUAL_SVE(expected_z6, z6.VnS());
9575 ASSERT_EQUAL_SVE(expected_z7, z7.VnS());
9576 ASSERT_EQUAL_SVE(expected_z8, z8.VnS());
9577
9578 // Scalar plus vector 32 unpacked unscaled offset
9579 uint64_t expected_z9[] = {0x00000000000000c0, 0x0000000000000001};
9580 uint64_t expected_z10[] = {0x00000000000041c1, 0x0000000000008001};
9581 uint64_t expected_z11[] = {0x0000000060a020c0, 0x00000000c0408001};
9582 uint64_t expected_z12[] = {0xffffffffffffffc0, 0x0000000000000001};
9583 uint64_t expected_z13[] = {0x00000000000041c1, 0xffffffffffff8001};
9584 uint64_t expected_z14[] = {0x0000000060a020c0, 0xffffffffc0408001};
9585
9586 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
9587 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
9588 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
9589 ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
9590 ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
9591 ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
9592
9593 // Scalar plus vector 32 unpacked scaled offset
9594 uint64_t expected_z15[] = {0x000000000000a161, 0x0000000000008001};
9595 uint64_t expected_z16[] = {0x00000000f070b030, 0x00000000c0408001};
9596 uint64_t expected_z17[] = {0x8949c929a969e919, 0xe060a020c0408001};
9597 uint64_t expected_z18[] = {0xffffffffffffa161, 0xffffffffffff8001};
9598 uint64_t expected_z19[] = {0xfffffffff070b030, 0xffffffffc0408001};
9599
9600 ASSERT_EQUAL_SVE(expected_z15, z15.VnD());
9601 ASSERT_EQUAL_SVE(expected_z16, z16.VnD());
9602 ASSERT_EQUAL_SVE(expected_z17, z17.VnD());
9603 ASSERT_EQUAL_SVE(expected_z18, z18.VnD());
9604 ASSERT_EQUAL_SVE(expected_z19, z19.VnD());
9605
9606 // Scalar plus vector 64 unscaled offset
9607 uint64_t expected_z20[] = {0x00000000000000c0, 0x0000000000000001};
9608 uint64_t expected_z21[] = {0x00000000000020c0, 0x0000000000008001};
9609 uint64_t expected_z22[] = {0x0000000060a020c0, 0x00000000c0408001};
9610 uint64_t expected_z23[] = {0x00000000000020c0, 0xffffffffffff8001};
9611 uint64_t expected_z24[] = {0x0000000060a020c0, 0xffffffffc0408001};
9612
9613 ASSERT_EQUAL_SVE(expected_z20, z20.VnD());
9614 ASSERT_EQUAL_SVE(expected_z21, z21.VnD());
9615 ASSERT_EQUAL_SVE(expected_z22, z22.VnD());
9616 ASSERT_EQUAL_SVE(expected_z23, z23.VnD());
9617 ASSERT_EQUAL_SVE(expected_z24, z24.VnD());
9618
9619 uint64_t expected_z25[] = {0x000000000000e060, 0x0000000000008001};
9620 uint64_t expected_z26[] = {0xffffffffffffe060, 0xffffffffffff8001};
9621 uint64_t expected_z27[] = {0x00000000f070b030, 0x00000000c0408001};
9622 uint64_t expected_z28[] = {0xfffffffff070b030, 0xffffffffc0408001};
9623 uint64_t expected_z29[] = {0xf878b838d8589818, 0xe060a020c0408001};
9624
9625 // Scalar plus vector 64 scaled offset
9626 ASSERT_EQUAL_SVE(expected_z25, z25.VnD());
9627 ASSERT_EQUAL_SVE(expected_z26, z26.VnD());
9628 ASSERT_EQUAL_SVE(expected_z27, z27.VnD());
9629 ASSERT_EQUAL_SVE(expected_z28, z28.VnD());
9630 ASSERT_EQUAL_SVE(expected_z29, z29.VnD());
9631 }
9632}
9633
Martyn Capewella5112342020-06-05 18:20:11 +010009634// This test emphasises checking that the addressing modifiers are propagated
// and simulated correctly.
9635TEST_SVE(sve_ld1_regression_test) {
9636 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
9637 START();
9638
9639 size_t page_size = sysconf(_SC_PAGE_SIZE);
9640 VIXL_ASSERT(page_size > static_cast<size_t>(config->sve_vl_in_bytes()));
9641
9642 uintptr_t data = reinterpret_cast<uintptr_t>(mmap(NULL,
9643 page_size * 2,
9644 PROT_READ | PROT_WRITE,
9645 MAP_PRIVATE | MAP_ANONYMOUS,
9646 -1,
9647 0));
9648 uintptr_t middle = data + page_size;
9649 // Fill the accessible page with arbitrary data.
9650 for (size_t i = 0; i < page_size; i++) {
9651 // Reverse bits so we get a mixture of positive and negative values.
9652 uint8_t byte = ReverseBits(static_cast<uint8_t>(i));
9653 memcpy(reinterpret_cast<void*>(middle + i), &byte, 1);
9654 // Make the byte differ by roughly one bit from the byte stored at the
9655 // mirrored positive offset, and copy the bytes in the reverse direction so
9656 // that loads using negative indexes are convenient to verify.
9657 byte += 1;
9658 memcpy(reinterpret_cast<void*>(middle - i), &byte, 1);
9659 }
9660
9661 PRegister all = p6;
9662 __ Ptrue(all.VnB());
9663
9664 __ Mov(x0, middle);
9665 __ Index(z31.VnS(), 0, 3);
9666 __ Neg(z30.VnS(), z31.VnS());
9667
9668 // Scalar plus vector 32 unscaled offset
9669 __ Ld1b(z1.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
9670 __ Ld1h(z2.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW));
9671 __ Ld1w(z3.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
9672 __ Ld1sb(z4.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW));
9673 __ Ld1sh(z5.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
9674
9675 // Scalar plus vector 32 scaled offset
9676 __ Ld1h(z6.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW, 1));
9677 __ Ld1w(z7.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW, 2));
9678 __ Ld1sh(z8.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW, 1));
9679
9680 __ Index(z31.VnD(), 0, 3);
9681 __ Neg(z30.VnD(), z31.VnD());
9682
9683 // Ensure only the low 32 bits are used when testing with positive index
9684 // values. This also tests that the indexes are treated as positive in
// `uxtw` form.
9685 __ Mov(x3, 0x8000000080000000);
9686 __ Dup(z28.VnD(), x3);
9687 __ Sub(x2, x0, 0x80000000);
9688 __ Add(z29.VnD(), z31.VnD(), z28.VnD());
9689
9690 // Scalar plus vector 32 unpacked unscaled offset
9691 __ Ld1b(z9.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9692 __ Ld1h(z10.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW));
9693 __ Ld1w(z11.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9694 __ Ld1sb(z12.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9695 __ Ld1sh(z13.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW));
9696 __ Ld1sw(z14.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9697
9698 // Scalar plus vector 32 unpacked scaled offset
9699 __ Ld1h(z15.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 1));
9700 __ Ld1w(z16.VnD(), all.Zeroing(), SVEMemOperand(x0, z31.VnD(), UXTW, 2));
9701 __ Ld1d(z17.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 3));
9702 __ Ld1sh(z18.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 1));
9703 __ Ld1sw(z19.VnD(), all.Zeroing(), SVEMemOperand(x0, z31.VnD(), UXTW, 2));
9704
9705 __ Sub(x0, x0, x3);
9706 // Note that `0x8000000080000000` has been added to the positive indexes, so
9707 // the wrong address will be accessed if the offset is treated as negative.
9708
9709 // Scalar plus vector 64 unscaled offset
9710 __ Ld1b(z20.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9711 __ Ld1h(z21.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9712 __ Ld1w(z22.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9713 __ Ld1sh(z23.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9714 __ Ld1sw(z24.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9715
9716 // Scalar plus vector 64 scaled offset
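  // As in the Ldff1 regression test above, the bias is pre-shifted right by the
  // scale amount so that the LSL in the addressing mode restores
  // 0x8000000080000000 and cancels the adjustment made to x0.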
9717 __ Lsr(z29.VnD(), z28.VnD(), 1); // Shift right to 0x4000000040000000
9718 __ Add(z30.VnD(), z31.VnD(), z29.VnD());
9719 __ Ld1h(z25.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 1));
9720 __ Ld1sh(z26.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 1));
9721
9722 __ Lsr(z29.VnD(), z29.VnD(), 1); // Shift right to 0x2000000020000000
9723 __ Add(z30.VnD(), z31.VnD(), z29.VnD());
9724 __ Ld1w(z27.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 2));
9725 __ Ld1sw(z28.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 2));
9726
9727 __ Lsr(z29.VnD(), z29.VnD(), 1); // Shift right to 0x1000000010000000
9728 __ Add(z30.VnD(), z31.VnD(), z29.VnD());
9729 __ Ld1d(z29.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 3));
9730
9731 END();
9732
9733 if (CAN_RUN()) {
9734 RUN();
9735
9736 // Scalar plus vector 32 unscaled offset
9737 uint32_t expected_z1[] = {0x00000090, 0x00000060, 0x000000c0, 0x00000001};
9738 uint32_t expected_z2[] = {0x00001191, 0x0000a161, 0x000041c1, 0x00008001};
9739 uint32_t expected_z3[] = {0x30d05090, 0x9010e060, 0x60a020c0, 0xc0408001};
9740 uint32_t expected_z4[] = {0xffffff91, 0x00000061, 0xffffffc1, 0x00000001};
9741 uint32_t expected_z5[] = {0x00005090, 0xffffe060, 0x000020c0, 0xffff8001};
9742
9743 ASSERT_EQUAL_SVE(expected_z1, z1.VnS());
9744 ASSERT_EQUAL_SVE(expected_z2, z2.VnS());
9745 ASSERT_EQUAL_SVE(expected_z3, z3.VnS());
9746 ASSERT_EQUAL_SVE(expected_z4, z4.VnS());
9747 ASSERT_EQUAL_SVE(expected_z5, z5.VnS());
9748
9749 // Scalar plus vector 32 scaled offset
9750 uint32_t expected_z6[] = {0x0000c848, 0x0000b030, 0x0000e060, 0x00008001};
9751 uint32_t expected_z7[] = {0xe464a424, 0xd8589818, 0xf070b030, 0xc0408001};
9752 uint32_t expected_z8[] = {0xffff8949, 0xffffd131, 0xffffa161, 0xffff8001};
9753
9754 ASSERT_EQUAL_SVE(expected_z6, z6.VnS());
9755 ASSERT_EQUAL_SVE(expected_z7, z7.VnS());
9756 ASSERT_EQUAL_SVE(expected_z8, z8.VnS());
9757
9758 // Scalar plus vector 32 unpacked unscaled offset
9759 uint64_t expected_z9[] = {0x00000000000000c0, 0x0000000000000001};
9760 uint64_t expected_z10[] = {0x00000000000041c1, 0x0000000000008001};
9761 uint64_t expected_z11[] = {0x0000000060a020c0, 0x00000000c0408001};
9762 uint64_t expected_z12[] = {0xffffffffffffffc0, 0x0000000000000001};
9763 uint64_t expected_z13[] = {0x00000000000041c1, 0xffffffffffff8001};
9764 uint64_t expected_z14[] = {0x0000000060a020c0, 0xffffffffc0408001};
9765
9766 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
9767 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
9768 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
9769 ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
9770 ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
9771 ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
9772
9773 // Scalar plus vector 32 unpacked scaled offset
9774 uint64_t expected_z15[] = {0x000000000000a161, 0x0000000000008001};
9775 uint64_t expected_z16[] = {0x00000000f070b030, 0x00000000c0408001};
9776 uint64_t expected_z17[] = {0x8949c929a969e919, 0xe060a020c0408001};
9777 uint64_t expected_z18[] = {0xffffffffffffa161, 0xffffffffffff8001};
9778 uint64_t expected_z19[] = {0xfffffffff070b030, 0xffffffffc0408001};
9779
9780 ASSERT_EQUAL_SVE(expected_z15, z15.VnD());
9781 ASSERT_EQUAL_SVE(expected_z16, z16.VnD());
9782 ASSERT_EQUAL_SVE(expected_z17, z17.VnD());
9783 ASSERT_EQUAL_SVE(expected_z18, z18.VnD());
9784 ASSERT_EQUAL_SVE(expected_z19, z19.VnD());
9785
9786 // Scalar plus vector 64 unscaled offset
9787 uint64_t expected_z20[] = {0x00000000000000c0, 0x0000000000000001};
9788 uint64_t expected_z21[] = {0x00000000000020c0, 0x0000000000008001};
9789 uint64_t expected_z22[] = {0x0000000060a020c0, 0x00000000c0408001};
9790 uint64_t expected_z23[] = {0x00000000000020c0, 0xffffffffffff8001};
9791 uint64_t expected_z24[] = {0x0000000060a020c0, 0xffffffffc0408001};
9792
9793 ASSERT_EQUAL_SVE(expected_z20, z20.VnD());
9794 ASSERT_EQUAL_SVE(expected_z21, z21.VnD());
9795 ASSERT_EQUAL_SVE(expected_z22, z22.VnD());
9796 ASSERT_EQUAL_SVE(expected_z23, z23.VnD());
9797 ASSERT_EQUAL_SVE(expected_z24, z24.VnD());
9798
9799 uint64_t expected_z25[] = {0x000000000000e060, 0x0000000000008001};
9800 uint64_t expected_z26[] = {0xffffffffffffe060, 0xffffffffffff8001};
9801 uint64_t expected_z27[] = {0x00000000f070b030, 0x00000000c0408001};
9802 uint64_t expected_z28[] = {0xfffffffff070b030, 0xffffffffc0408001};
9803 uint64_t expected_z29[] = {0xf878b838d8589818, 0xe060a020c0408001};
9804
9805 // Scalar plus vector 64 scaled offset
9806 ASSERT_EQUAL_SVE(expected_z25, z25.VnD());
9807 ASSERT_EQUAL_SVE(expected_z26, z26.VnD());
9808 ASSERT_EQUAL_SVE(expected_z27, z27.VnD());
9809 ASSERT_EQUAL_SVE(expected_z28, z28.VnD());
9810 ASSERT_EQUAL_SVE(expected_z29, z29.VnD());
9811 }
9812}
9813
TatWai Chong113d9192020-05-19 01:02:36 -07009814// Test gather loads by comparing them with the result of a set of equivalent
9815// scalar loads.
Martyn Capewella5112342020-06-05 18:20:11 +01009816template <typename T>
TatWai Chong113d9192020-05-19 01:02:36 -07009817static void GatherLoadScalarPlusVectorHelper(Test* config,
9818 unsigned msize_in_bits,
9819 unsigned esize_in_bits,
9820 Ld1Macro ld1,
TatWai Chong6537a9a2020-05-05 14:15:16 -07009821 Ld1Macro ldff1,
Martyn Capewella5112342020-06-05 18:20:11 +01009822 T mod,
TatWai Chong113d9192020-05-19 01:02:36 -07009823 bool is_signed,
9824 bool is_scaled) {
9825 // SVE supports 32- and 64-bit addressing for gather loads.
9826 VIXL_ASSERT((esize_in_bits == kSRegSize) || (esize_in_bits == kDRegSize));
9827 static const unsigned kMaxLaneCount = kZRegMaxSize / kSRegSize;
9828
9829 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
9830 START();
9831
9832 unsigned msize_in_bytes = msize_in_bits / kBitsPerByte;
9833 int vl = config->sve_vl_in_bytes();
9834
9835 uint64_t addresses[kMaxLaneCount];
9836 uint64_t offsets[kMaxLaneCount];
9837 uint64_t max_address = 0;
9838 uint64_t buffer_size = vl * 64;
9839 uint64_t data = reinterpret_cast<uintptr_t>(malloc(buffer_size));
9840 // Fill the buffer with arbitrary data and, at the same time, generate random
9841 // addresses and offsets into the buffer, storing them in the output arrays.
9842 BufferFillingHelper(data,
9843 buffer_size,
9844 msize_in_bytes,
9845 kMaxLaneCount,
9846 offsets,
9847 addresses,
9848 &max_address);
9849
9850 ZRegister zn = z0.WithLaneSize(esize_in_bits);
9851 ZRegister zt_ref = z1.WithLaneSize(esize_in_bits);
Martyn Capewella5112342020-06-05 18:20:11 +01009852 ZRegister zt = z2.WithLaneSize(esize_in_bits);
9853 ZRegister zt_ff = z3.WithLaneSize(esize_in_bits);
9854 PRegisterWithLaneSize pg_ff = p1.WithLaneSize(esize_in_bits);
9855 PRegisterWithLaneSize pg_diff = p2.WithLaneSize(esize_in_bits);
TatWai Chong113d9192020-05-19 01:02:36 -07009856
9857 int shift = 0;
9858 if (is_scaled) {
9859 shift = std::log2(msize_in_bytes);
9860 for (unsigned i = 0; i < kMaxLaneCount; i++) {
9861 // Ensure the offsets are multiples of the scale factor of the
9862 // operation.
9863 offsets[i] = (offsets[i] >> shift) << shift;
9864 addresses[i] = data + offsets[i];
9865 }
9866 }
9867
9868 PRegister all = p6;
9869 __ Ptrue(all.WithLaneSize(esize_in_bits));
9870
9871 PRegisterZ pg = p0.Zeroing();
9872 Initialise(&masm,
9873 pg,
9874 0x9abcdef012345678,
9875 0xabcdef0123456789,
9876 0xf4f3f1f0fefdfcfa,
9877 0xf9f8f6f5f3f2f1ff);
9878
9879 __ Mov(x0, data);
9880
9881 // Generate a reference result using an equivalent sequence of scalar loads.
9882 ScalarLoadHelper(&masm,
9883 vl,
9884 addresses,
9885 zt_ref,
9886 pg,
9887 esize_in_bits,
9888 msize_in_bits,
9889 is_signed);
9890
9891 InsrHelper(&masm, zn, offsets);
9892 if (is_scaled) {
9893 // Scale down the offsets if testing scaled-offset operation.
9894 __ Lsr(zn, zn, shift);
9895 }
9896
Martyn Capewella5112342020-06-05 18:20:11 +01009897 (masm.*ld1)(zt, pg, SVEMemOperand(x0, zn, mod, shift));
TatWai Chong113d9192020-05-19 01:02:36 -07009898
TatWai Chong6537a9a2020-05-05 14:15:16 -07009899 Register ffr_check_count = x17;
9900 __ Mov(ffr_check_count, 0);
9901
TatWai Chong6537a9a2020-05-05 14:15:16 -07009902 // Test that the gather load reads the correct data from the different
9903 // addresses; the first-fault behaviour itself is covered by `Ldff1Helper`.
9904 __ Setffr();
Martyn Capewella5112342020-06-05 18:20:11 +01009905 (masm.*ldff1)(zt_ff, pg, SVEMemOperand(x0, zn, mod, shift));
9906
9907 // Cross-check the comparison result against the FFR, counting any
9908 // inconsistent lanes in `ffr_check_count`.
9909 __ Rdffrs(pg_ff.VnB(), all.Zeroing());
9910 __ Cmpeq(pg_diff, all.Zeroing(), zt_ref, zt_ff);
9911 __ Eor(pg_diff.VnB(), all.Zeroing(), pg_diff.VnB(), pg_ff.VnB());
9912 __ Incp(ffr_check_count, pg_diff);
TatWai Chong6537a9a2020-05-05 14:15:16 -07009913
TatWai Chong113d9192020-05-19 01:02:36 -07009914 END();
9915
9916 if (CAN_RUN()) {
9917 RUN();
9918
Martyn Capewella5112342020-06-05 18:20:11 +01009919 ASSERT_EQUAL_SVE(zt_ref, zt);
TatWai Chong6537a9a2020-05-05 14:15:16 -07009920 ASSERT_EQUAL_64(0, ffr_check_count);
TatWai Chong113d9192020-05-19 01:02:36 -07009921 }
9922
9923 free(reinterpret_cast<void*>(data));
9924}
9925
Jacob Bramleydcdbd752020-01-20 11:47:36 +00009926// Test gather loads by comparing them with the result of a set of equivalent
9927// scalar loads.
9928template <typename F>
TatWai Chong113d9192020-05-19 01:02:36 -07009929static void GatherLoadScalarPlusScalarOrImmHelper(Test* config,
9930 unsigned msize_in_bits,
9931 unsigned esize_in_bits,
9932 F sve_ld1,
9933 bool is_signed) {
Jacob Bramleydcdbd752020-01-20 11:47:36 +00009934 // SVE supports 32- and 64-bit addressing for gather loads.
9935 VIXL_ASSERT((esize_in_bits == kSRegSize) || (esize_in_bits == kDRegSize));
9936 static const unsigned kMaxLaneCount = kZRegMaxSize / kSRegSize;
9937
9938 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
9939 START();
9940
9941 unsigned msize_in_bytes = msize_in_bits / kBitsPerByte;
Jacob Bramleydcdbd752020-01-20 11:47:36 +00009942 int vl = config->sve_vl_in_bytes();
9943
Jacob Bramleydcdbd752020-01-20 11:47:36 +00009944 uint64_t addresses[kMaxLaneCount];
9945 uint64_t offsets[kMaxLaneCount];
9946 uint64_t max_address = 0;
TatWai Chong85e15102020-05-04 21:00:40 -07009947 uint64_t buffer_size = vl * 64;
9948 uint64_t data = reinterpret_cast<uintptr_t>(malloc(buffer_size));
9949 BufferFillingHelper(data,
9950 buffer_size,
9951 msize_in_bytes,
9952 kMaxLaneCount,
9953 offsets,
9954 addresses,
9955 &max_address);
Jacob Bramleydcdbd752020-01-20 11:47:36 +00009956
9957 // Maximised offsets, to ensure that the address calculation is modulo-2^64,
9958 // and that the vector addresses are not sign-extended.
9959 uint64_t uint_e_max = (esize_in_bits == kDRegSize) ? UINT64_MAX : UINT32_MAX;
9960 uint64_t maxed_offsets[kMaxLaneCount];
9961 uint64_t maxed_offsets_imm = max_address - uint_e_max;
9962 for (unsigned i = 0; i < kMaxLaneCount; i++) {
9963 maxed_offsets[i] = addresses[i] - maxed_offsets_imm;
9964 }
9965
9966 ZRegister zn = z0.WithLaneSize(esize_in_bits);
9967 ZRegister zt_addresses = z1.WithLaneSize(esize_in_bits);
9968 ZRegister zt_offsets = z2.WithLaneSize(esize_in_bits);
9969 ZRegister zt_maxed = z3.WithLaneSize(esize_in_bits);
9970 ZRegister zt_ref = z4.WithLaneSize(esize_in_bits);
9971
9972 PRegisterZ pg = p0.Zeroing();
9973 Initialise(&masm,
9974 pg,
9975 0x9abcdef012345678,
9976 0xabcdef0123456789,
9977 0xf4f3f1f0fefdfcfa,
9978 0xf9f8f6f5f3f2f0ff);
9979
9980 // Execute each load.
9981
9982 if (esize_in_bits == kDRegSize) {
9983 // Only test `addresses` if we can use 64-bit pointers. InsrHelper will fail
9984 // if any value won't fit in a lane of zn.
9985 InsrHelper(&masm, zn, addresses);
9986 (masm.*sve_ld1)(zt_addresses, pg, SVEMemOperand(zn));
9987 }
9988
9989 InsrHelper(&masm, zn, offsets);
9990 (masm.*sve_ld1)(zt_offsets, pg, SVEMemOperand(zn, data));
9991
9992 InsrHelper(&masm, zn, maxed_offsets);
9993 (masm.*sve_ld1)(zt_maxed, pg, SVEMemOperand(zn, maxed_offsets_imm));
9994
Jacob Bramleydcdbd752020-01-20 11:47:36 +00009995 // Generate a reference result using scalar loads.
TatWai Chong85e15102020-05-04 21:00:40 -07009996 ScalarLoadHelper(&masm,
9997 vl,
9998 addresses,
9999 zt_ref,
10000 pg,
10001 esize_in_bits,
10002 msize_in_bits,
10003 is_signed);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010004
10005 END();
10006
10007 if (CAN_RUN()) {
10008 RUN();
10009
10010 if (esize_in_bits == kDRegSize) {
10011 ASSERT_EQUAL_SVE(zt_ref, zt_addresses);
10012 }
10013 ASSERT_EQUAL_SVE(zt_ref, zt_offsets);
10014 ASSERT_EQUAL_SVE(zt_ref, zt_maxed);
10015 }
10016
10017 free(reinterpret_cast<void*>(data));
10018}
10019
10020TEST_SVE(sve_ld1b_64bit_vector_plus_immediate) {
TatWai Chong113d9192020-05-19 01:02:36 -070010021 GatherLoadScalarPlusScalarOrImmHelper(config,
10022 kBRegSize,
10023 kDRegSize,
10024 &MacroAssembler::Ld1b,
10025 false);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010026}
10027
10028TEST_SVE(sve_ld1h_64bit_vector_plus_immediate) {
TatWai Chong113d9192020-05-19 01:02:36 -070010029 GatherLoadScalarPlusScalarOrImmHelper(config,
10030 kHRegSize,
10031 kDRegSize,
10032 &MacroAssembler::Ld1h,
10033 false);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010034}
10035
10036TEST_SVE(sve_ld1w_64bit_vector_plus_immediate) {
TatWai Chong113d9192020-05-19 01:02:36 -070010037 GatherLoadScalarPlusScalarOrImmHelper(config,
10038 kSRegSize,
10039 kDRegSize,
10040 &MacroAssembler::Ld1w,
10041 false);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010042}
10043
10044TEST_SVE(sve_ld1d_64bit_vector_plus_immediate) {
TatWai Chong113d9192020-05-19 01:02:36 -070010045 GatherLoadScalarPlusScalarOrImmHelper(config,
10046 kDRegSize,
10047 kDRegSize,
10048 &MacroAssembler::Ld1d,
10049 false);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010050}
10051
10052TEST_SVE(sve_ld1sb_64bit_vector_plus_immediate) {
TatWai Chong113d9192020-05-19 01:02:36 -070010053 GatherLoadScalarPlusScalarOrImmHelper(config,
10054 kBRegSize,
10055 kDRegSize,
10056 &MacroAssembler::Ld1sb,
10057 true);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010058}
10059
10060TEST_SVE(sve_ld1sh_64bit_vector_plus_immediate) {
TatWai Chong113d9192020-05-19 01:02:36 -070010061 GatherLoadScalarPlusScalarOrImmHelper(config,
10062 kHRegSize,
10063 kDRegSize,
10064 &MacroAssembler::Ld1sh,
10065 true);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010066}
10067
10068TEST_SVE(sve_ld1sw_64bit_vector_plus_immediate) {
TatWai Chong113d9192020-05-19 01:02:36 -070010069 GatherLoadScalarPlusScalarOrImmHelper(config,
10070 kSRegSize,
10071 kDRegSize,
10072 &MacroAssembler::Ld1sw,
10073 true);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010074}
10075
10076TEST_SVE(sve_ld1b_32bit_vector_plus_immediate) {
TatWai Chong113d9192020-05-19 01:02:36 -070010077 GatherLoadScalarPlusScalarOrImmHelper(config,
10078 kBRegSize,
10079 kSRegSize,
10080 &MacroAssembler::Ld1b,
10081 false);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010082}
10083
10084TEST_SVE(sve_ld1h_32bit_vector_plus_immediate) {
TatWai Chong113d9192020-05-19 01:02:36 -070010085 GatherLoadScalarPlusScalarOrImmHelper(config,
10086 kHRegSize,
10087 kSRegSize,
10088 &MacroAssembler::Ld1h,
10089 false);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010090}
10091
10092TEST_SVE(sve_ld1w_32bit_vector_plus_immediate) {
TatWai Chong113d9192020-05-19 01:02:36 -070010093 GatherLoadScalarPlusScalarOrImmHelper(config,
10094 kSRegSize,
10095 kSRegSize,
10096 &MacroAssembler::Ld1w,
10097 false);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010098}
10099
10100TEST_SVE(sve_ld1sb_32bit_vector_plus_immediate) {
TatWai Chong113d9192020-05-19 01:02:36 -070010101 GatherLoadScalarPlusScalarOrImmHelper(config,
10102 kBRegSize,
10103 kSRegSize,
10104 &MacroAssembler::Ld1sb,
10105 true);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010106}
10107
10108TEST_SVE(sve_ld1sh_32bit_vector_plus_immediate) {
TatWai Chong113d9192020-05-19 01:02:36 -070010109 GatherLoadScalarPlusScalarOrImmHelper(config,
10110 kHRegSize,
10111 kSRegSize,
10112 &MacroAssembler::Ld1sh,
10113 true);
10114}
10115
Martyn Capewella5112342020-06-05 18:20:11 +010010116TEST_SVE(sve_ld1_scalar_plus_vector_32_scaled_offset) {
10117 auto ld1_32_scaled_offset_helper =
10118 std::bind(&GatherLoadScalarPlusVectorHelper<Extend>,
10119 config,
10120 std::placeholders::_1,
10121 kSRegSize,
10122 std::placeholders::_2,
10123 std::placeholders::_3,
10124 std::placeholders::_4,
10125 std::placeholders::_5,
10126 true);
10127
10128 Ld1Macro ld1h = &MacroAssembler::Ld1h;
10129 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
10130 ld1_32_scaled_offset_helper(kHRegSize, ld1h, ldff1h, UXTW, false);
10131 ld1_32_scaled_offset_helper(kHRegSize, ld1h, ldff1h, SXTW, false);
10132
10133 Ld1Macro ld1w = &MacroAssembler::Ld1w;
10134 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
10135 ld1_32_scaled_offset_helper(kSRegSize, ld1w, ldff1w, UXTW, false);
10136 ld1_32_scaled_offset_helper(kSRegSize, ld1w, ldff1w, SXTW, false);
10137
10138 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
10139 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
10140 ld1_32_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, UXTW, true);
10141 ld1_32_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, SXTW, true);
TatWai Chong113d9192020-05-19 01:02:36 -070010142}
10143
Martyn Capewella5112342020-06-05 18:20:11 +010010144TEST_SVE(sve_ld1_scalar_plus_vector_32_unscaled_offset) {
10145 auto ld1_32_unscaled_offset_helper =
10146 std::bind(&GatherLoadScalarPlusVectorHelper<Extend>,
10147 config,
10148 std::placeholders::_1,
10149 kSRegSize,
10150 std::placeholders::_2,
10151 std::placeholders::_3,
10152 std::placeholders::_4,
10153 std::placeholders::_5,
10154 false);
TatWai Chong113d9192020-05-19 01:02:36 -070010155
Martyn Capewella5112342020-06-05 18:20:11 +010010156 Ld1Macro ld1b = &MacroAssembler::Ld1b;
10157 Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
10158 ld1_32_unscaled_offset_helper(kBRegSize, ld1b, ldff1b, UXTW, false);
10159 ld1_32_unscaled_offset_helper(kBRegSize, ld1b, ldff1b, SXTW, false);
10160
10161 Ld1Macro ld1h = &MacroAssembler::Ld1h;
10162 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
10163 ld1_32_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, UXTW, false);
10164 ld1_32_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, SXTW, false);
10165
10166 Ld1Macro ld1w = &MacroAssembler::Ld1w;
10167 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
10168 ld1_32_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, UXTW, false);
10169 ld1_32_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, SXTW, false);
10170
10171 Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
10172 Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
10173 ld1_32_unscaled_offset_helper(kBRegSize, ld1sb, ldff1sb, UXTW, true);
10174 ld1_32_unscaled_offset_helper(kBRegSize, ld1sb, ldff1sb, SXTW, true);
10175
10176 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
10177 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
10178 ld1_32_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, UXTW, true);
10179 ld1_32_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, SXTW, true);
TatWai Chong113d9192020-05-19 01:02:36 -070010180}
10181
Martyn Capewella5112342020-06-05 18:20:11 +010010182TEST_SVE(sve_ld1_scalar_plus_vector_32_unpacked_scaled_offset) {
10183 auto ld1_32_unpacked_scaled_offset_helper =
10184 std::bind(&GatherLoadScalarPlusVectorHelper<Extend>,
10185 config,
10186 std::placeholders::_1,
10187 kDRegSize,
10188 std::placeholders::_2,
10189 std::placeholders::_3,
10190 std::placeholders::_4,
10191 std::placeholders::_5,
10192 true);
TatWai Chong113d9192020-05-19 01:02:36 -070010193
Martyn Capewella5112342020-06-05 18:20:11 +010010194 Ld1Macro ld1h = &MacroAssembler::Ld1h;
10195 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
10196 ld1_32_unpacked_scaled_offset_helper(kHRegSize, ld1h, ldff1h, UXTW, false);
10197 ld1_32_unpacked_scaled_offset_helper(kHRegSize, ld1h, ldff1h, SXTW, false);
10198
10199 Ld1Macro ld1w = &MacroAssembler::Ld1w;
10200 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
10201 ld1_32_unpacked_scaled_offset_helper(kSRegSize, ld1w, ldff1w, UXTW, false);
10202 ld1_32_unpacked_scaled_offset_helper(kSRegSize, ld1w, ldff1w, SXTW, false);
10203
10204 Ld1Macro ld1d = &MacroAssembler::Ld1d;
10205 Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
10206 ld1_32_unpacked_scaled_offset_helper(kDRegSize, ld1d, ldff1d, UXTW, false);
10207 ld1_32_unpacked_scaled_offset_helper(kDRegSize, ld1d, ldff1d, SXTW, false);
10208
10209 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
10210 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
10211 ld1_32_unpacked_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, UXTW, true);
10212 ld1_32_unpacked_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, SXTW, true);
10213
10214 Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
10215 Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
10216 ld1_32_unpacked_scaled_offset_helper(kSRegSize, ld1sw, ldff1sw, UXTW, true);
10217 ld1_32_unpacked_scaled_offset_helper(kSRegSize, ld1sw, ldff1sw, SXTW, true);
TatWai Chong113d9192020-05-19 01:02:36 -070010218}
10219
Martyn Capewella5112342020-06-05 18:20:11 +010010220TEST_SVE(sve_ld1_scalar_plus_vector_32_unpacked_unscaled_offset) {
10221 auto ld1_32_unpacked_unscaled_offset_helper =
10222 std::bind(&GatherLoadScalarPlusVectorHelper<Extend>,
10223 config,
10224 std::placeholders::_1,
10225 kDRegSize,
10226 std::placeholders::_2,
10227 std::placeholders::_3,
10228 std::placeholders::_4,
10229 std::placeholders::_5,
10230 false);
10231
10232 Ld1Macro ld1h = &MacroAssembler::Ld1h;
10233 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
10234 ld1_32_unpacked_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, UXTW, false);
10235 ld1_32_unpacked_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, SXTW, false);
10236
10237 Ld1Macro ld1w = &MacroAssembler::Ld1w;
10238 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
10239 ld1_32_unpacked_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, UXTW, false);
10240 ld1_32_unpacked_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, SXTW, false);
10241
10242 Ld1Macro ld1d = &MacroAssembler::Ld1d;
10243 Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
10244 ld1_32_unpacked_unscaled_offset_helper(kDRegSize, ld1d, ldff1d, UXTW, false);
10245 ld1_32_unpacked_unscaled_offset_helper(kDRegSize, ld1d, ldff1d, SXTW, false);
10246
10247 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
10248 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
10249 ld1_32_unpacked_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, UXTW, true);
10250 ld1_32_unpacked_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, SXTW, true);
10251
10252 Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
10253 Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
10254 ld1_32_unpacked_unscaled_offset_helper(kSRegSize, ld1sw, ldff1sw, UXTW, true);
10255 ld1_32_unpacked_unscaled_offset_helper(kSRegSize, ld1sw, ldff1sw, SXTW, true);
TatWai Chong113d9192020-05-19 01:02:36 -070010256}
10257
Martyn Capewella5112342020-06-05 18:20:11 +010010258TEST_SVE(sve_ld1_scalar_plus_vector_64_scaled_offset) {
10259 auto ld1_64_scaled_offset_helper =
10260 std::bind(&GatherLoadScalarPlusVectorHelper<Shift>,
10261 config,
10262 std::placeholders::_1,
10263 kDRegSize,
10264 std::placeholders::_2,
10265 std::placeholders::_3,
10266 LSL,
10267 std::placeholders::_4,
10268 true);
TatWai Chong113d9192020-05-19 01:02:36 -070010269
Martyn Capewella5112342020-06-05 18:20:11 +010010270 Ld1Macro ld1h = &MacroAssembler::Ld1h;
10271 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
10272 ld1_64_scaled_offset_helper(kHRegSize, ld1h, ldff1h, false);
10273
10274 Ld1Macro ld1w = &MacroAssembler::Ld1w;
10275 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
10276 ld1_64_scaled_offset_helper(kSRegSize, ld1w, ldff1w, false);
10277
10278 Ld1Macro ld1d = &MacroAssembler::Ld1d;
10279 Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
10280 ld1_64_scaled_offset_helper(kDRegSize, ld1d, ldff1d, false);
10281
10282 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
10283 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
10284 ld1_64_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, true);
10285
10286 Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
10287 Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
10288 ld1_64_scaled_offset_helper(kSRegSize, ld1sw, ldff1sw, true);
10289}
10290
10291TEST_SVE(sve_ld1_scalar_plus_vector_64_unscaled_offset) {
10292 auto ld1_64_unscaled_offset_helper =
10293 std::bind(&GatherLoadScalarPlusVectorHelper<Shift>,
10294 config,
10295 std::placeholders::_1,
10296 kDRegSize,
10297 std::placeholders::_2,
10298 std::placeholders::_3,
10299 NO_SHIFT,
10300 std::placeholders::_4,
10301 false);
10302
10303 Ld1Macro ld1b = &MacroAssembler::Ld1b;
10304 Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
10305 ld1_64_unscaled_offset_helper(kBRegSize, ld1b, ldff1b, false);
10306
10307 Ld1Macro ld1h = &MacroAssembler::Ld1h;
10308 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
10309 ld1_64_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, false);
10310
10311 Ld1Macro ld1w = &MacroAssembler::Ld1w;
10312 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
10313 ld1_64_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, false);
10314
10315 Ld1Macro ld1d = &MacroAssembler::Ld1d;
10316 Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
10317 ld1_64_unscaled_offset_helper(kDRegSize, ld1d, ldff1d, false);
10318
10319 Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
10320 Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
10321 ld1_64_unscaled_offset_helper(kBRegSize, ld1sb, ldff1sb, true);
10322
10323 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
10324 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
10325 ld1_64_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, true);
10326
10327 Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
10328 Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
10329 ld1_64_unscaled_offset_helper(kSRegSize, ld1sw, ldff1sw, true);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010330}
10331
Martyn Capewell72765d12020-03-23 14:25:53 +000010332TEST_SVE(sve_ldnt1) {
10333 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
10334 START();
10335
10336 int data_size = kZRegMaxSizeInBytes * 16;
10337 uint8_t* data = new uint8_t[data_size];
10338 for (int i = 0; i < data_size; i++) {
10339 data[i] = i & 0xff;
10340 }
10341
10342 // Set the base half-way through the buffer so we can use negative indices.
10343 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
10344 __ Ptrue(p0.VnB());
10345 __ Punpklo(p1.VnH(), p0.VnB());
10346 __ Punpklo(p2.VnH(), p1.VnB());
10347 __ Punpklo(p3.VnH(), p2.VnB());
10348 __ Punpklo(p4.VnH(), p3.VnB());
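  // p1, p2, p3 and p4 now enable alternate B, H, S and D lanes respectively
  // when used at the matching element size: each Punpklo doubles the spacing
  // of the active predicate bits.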
10349
10350 __ Mov(x1, 42);
10351 __ Ld1b(z0.VnB(), p1.Zeroing(), SVEMemOperand(x0, x1));
10352 __ Ldnt1b(z1.VnB(), p1.Zeroing(), SVEMemOperand(x0, x1));
10353
10354 __ Mov(x1, -21);
10355 __ Ld1h(z2.VnH(), p2.Zeroing(), SVEMemOperand(x0, x1, LSL, 1));
10356 __ Ldnt1h(z3.VnH(), p2.Zeroing(), SVEMemOperand(x0, x1, LSL, 1));
10357
10358 __ Mov(x1, 10);
10359 __ Ld1w(z4.VnS(), p3.Zeroing(), SVEMemOperand(x0, x1, LSL, 2));
10360 __ Ldnt1w(z5.VnS(), p3.Zeroing(), SVEMemOperand(x0, x1, LSL, 2));
10361
10362 __ Mov(x1, -5);
10363 __ Ld1d(z6.VnD(), p4.Zeroing(), SVEMemOperand(x0, x1, LSL, 3));
10364 __ Ldnt1d(z7.VnD(), p4.Zeroing(), SVEMemOperand(x0, x1, LSL, 3));
10365
10366 __ Ld1b(z8.VnB(), p1.Zeroing(), SVEMemOperand(x0, 1, SVE_MUL_VL));
10367 __ Ldnt1b(z9.VnB(), p1.Zeroing(), SVEMemOperand(x0, 1, SVE_MUL_VL));
10368
10369 __ Ld1h(z10.VnH(), p2.Zeroing(), SVEMemOperand(x0, -1, SVE_MUL_VL));
10370 __ Ldnt1h(z11.VnH(), p2.Zeroing(), SVEMemOperand(x0, -1, SVE_MUL_VL));
10371
10372 __ Ld1w(z12.VnS(), p3.Zeroing(), SVEMemOperand(x0, 7, SVE_MUL_VL));
10373 __ Ldnt1w(z13.VnS(), p3.Zeroing(), SVEMemOperand(x0, 7, SVE_MUL_VL));
10374
10375 __ Ld1d(z14.VnD(), p4.Zeroing(), SVEMemOperand(x0, -8, SVE_MUL_VL));
10376 __ Ldnt1d(z15.VnD(), p4.Zeroing(), SVEMemOperand(x0, -8, SVE_MUL_VL));
10377 END();
10378
10379 if (CAN_RUN()) {
10380 RUN();
10381 ASSERT_EQUAL_SVE(z0, z1);
10382 ASSERT_EQUAL_SVE(z2, z3);
10383 ASSERT_EQUAL_SVE(z4, z5);
10384 ASSERT_EQUAL_SVE(z6, z7);
10385 ASSERT_EQUAL_SVE(z8, z9);
10386 ASSERT_EQUAL_SVE(z10, z11);
10387 ASSERT_EQUAL_SVE(z12, z13);
10388 ASSERT_EQUAL_SVE(z14, z15);
10389 }

  delete[] data;
10390}
10391
Martyn Capewell3e2fb502020-03-24 12:04:07 +000010392TEST_SVE(sve_stnt1) {
10393 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
10394 START();
10395
10396 int data_size = kZRegMaxSizeInBytes * 16;
10397 uint8_t* data = new uint8_t[data_size];
10398
10399 // Set the base half-way through the buffer so we can use negative indices.
10400 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
10401 __ Ptrue(p0.VnB());
10402 __ Punpklo(p1.VnH(), p0.VnB());
10403 __ Punpklo(p2.VnH(), p1.VnB());
10404 __ Punpklo(p3.VnH(), p2.VnB());
10405 __ Punpklo(p4.VnH(), p3.VnB());
10406 __ Dup(z0.VnB(), 0x55);
10407 __ Index(z1.VnB(), 0, 1);
10408
10409 // Store with all-true and patterned predication, load back, and create a
10410 // reference value for later comparison.
10411 __ Rdvl(x1, 1);
10412 __ Stnt1b(z0.VnB(), p0, SVEMemOperand(x0, x1));
10413 __ Stnt1b(z1.VnB(), p1, SVEMemOperand(x0, 1, SVE_MUL_VL));
10414 __ Ld1b(z2.VnB(), p0.Zeroing(), SVEMemOperand(x0, x1));
10415 __ Sel(z3.VnB(), p1, z1.VnB(), z0.VnB());
10416
10417 // Repeated, with wider elements and different offsets.
10418 __ Rdvl(x1, -1);
10419 __ Lsr(x1, x1, 1);
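  // x1 holds (-VL >> 1) as an unsigned value; the LSL #1 in the addressing
  // mode shifts it back, so the first store below uses the same -1 * VL byte
  // offset as the SVE_MUL_VL immediate form of the second.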
10420 __ Stnt1h(z0.VnH(), p0, SVEMemOperand(x0, x1, LSL, 1));
10421 __ Stnt1h(z1.VnH(), p2, SVEMemOperand(x0, -1, SVE_MUL_VL));
10422 __ Ld1b(z4.VnB(), p0.Zeroing(), SVEMemOperand(x0, x1, LSL, 1));
10423 __ Sel(z5.VnH(), p2, z1.VnH(), z0.VnH());
10424
10425 __ Rdvl(x1, 7);
10426 __ Lsr(x1, x1, 2);
10427 __ Stnt1w(z0.VnS(), p0, SVEMemOperand(x0, x1, LSL, 2));
10428 __ Stnt1w(z1.VnS(), p3, SVEMemOperand(x0, 7, SVE_MUL_VL));
10429 __ Ld1b(z6.VnB(), p0.Zeroing(), SVEMemOperand(x0, x1, LSL, 2));
10430 __ Sel(z7.VnS(), p3, z1.VnS(), z0.VnS());
10431
10432 __ Rdvl(x1, -8);
10433 __ Lsr(x1, x1, 3);
10434 __ Stnt1d(z0.VnD(), p0, SVEMemOperand(x0, x1, LSL, 3));
10435 __ Stnt1d(z1.VnD(), p4, SVEMemOperand(x0, -8, SVE_MUL_VL));
10436 __ Ld1b(z8.VnB(), p0.Zeroing(), SVEMemOperand(x0, x1, LSL, 3));
10437 __ Sel(z9.VnD(), p4, z1.VnD(), z0.VnD());
10438 END();
10439
10440 if (CAN_RUN()) {
10441 RUN();
10442 ASSERT_EQUAL_SVE(z2, z3);
10443 ASSERT_EQUAL_SVE(z4, z5);
10444 ASSERT_EQUAL_SVE(z6, z7);
10445 ASSERT_EQUAL_SVE(z8, z9);
10446 }

  delete[] data;
10447}
10448
Martyn Capewell452ad8b2020-03-19 15:49:57 +000010449TEST_SVE(sve_ld1rq) {
10450 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
10451 START();
10452
10453 int data_size = (kQRegSizeInBytes + 128) * 2;
10454 uint8_t* data = new uint8_t[data_size];
10455 for (int i = 0; i < data_size; i++) {
10456 data[i] = i & 0xff;
10457 }
10458
10459 // Set the base half-way through the buffer so we can use negative indices.
10460 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
10461
10462 __ Index(z0.VnB(), 0, 1);
10463 __ Ptrue(p0.VnB());
10464 __ Cmplo(p0.VnB(), p0.Zeroing(), z0.VnB(), 4);
10465 __ Pfalse(p1.VnB());
10466 __ Zip1(p1.VnB(), p0.VnB(), p1.VnB());
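  // p1 is active only for byte lanes 0, 2, 4 and 6: Cmplo limits p0 to the
  // lowest four lanes, and zipping with an all-false predicate spreads those
  // bits to the even positions.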
10467
10468 // Load and broadcast using scalar offsets.
10469 __ Mov(x1, -42);
10470 __ Ld1rqb(z0.VnB(), p1.Zeroing(), SVEMemOperand(x0, x1));
10471
10472 __ Add(x2, x0, 1);
10473 __ Mov(x1, -21);
10474 __ Punpklo(p2.VnH(), p1.VnB());
10475 __ Ld1rqh(z1.VnH(), p2.Zeroing(), SVEMemOperand(x2, x1, LSL, 1));
10476
10477 __ Add(x2, x2, 1);
10478 __ Mov(x1, -10);
10479 __ Punpklo(p3.VnH(), p2.VnB());
10480 __ Ld1rqw(z2.VnS(), p3.Zeroing(), SVEMemOperand(x2, x1, LSL, 2));
10481
10482 __ Add(x2, x2, 1);
10483 __ Mov(x1, 5);
10484 __ Punpklo(p4.VnH(), p3.VnB());
10485 __ Ld1rqd(z3.VnD(), p4.Zeroing(), SVEMemOperand(x2, x1, LSL, 3));
10486
10487 // Check that all segments match by rotating the vector by one segment,
10488 // eoring, and orring across the vector.
10489 __ Ext(z4.VnB(), z0.VnB(), z0.VnB(), 16);
10490 __ Eor(z4.VnB(), z4.VnB(), z0.VnB());
10491 __ Orv(b4, p0, z4.VnB());
10492 __ Ext(z5.VnB(), z1.VnB(), z1.VnB(), 16);
10493 __ Eor(z5.VnB(), z5.VnB(), z1.VnB());
10494 __ Orv(b5, p0, z5.VnB());
10495 __ Orr(z4, z4, z5);
10496 __ Ext(z5.VnB(), z2.VnB(), z2.VnB(), 16);
10497 __ Eor(z5.VnB(), z5.VnB(), z2.VnB());
10498 __ Orv(b5, p0, z5.VnB());
10499 __ Orr(z4, z4, z5);
10500 __ Ext(z5.VnB(), z3.VnB(), z3.VnB(), 16);
10501 __ Eor(z5.VnB(), z5.VnB(), z3.VnB());
10502 __ Orv(b5, p0, z5.VnB());
10503 __ Orr(z4, z4, z5);
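  // z4 ends up all-zero only if every 128-bit segment of z0-z3 matches its
  // neighbouring segment, which the checks below rely on.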
10504
10505 // Load and broadcast the same values, using immediate offsets.
10506 __ Add(x1, x0, 6);
10507 __ Ld1rqb(z5.VnB(), p1.Zeroing(), SVEMemOperand(x1, -48));
10508 __ Add(x1, x0, -9);
10509 __ Ld1rqh(z6.VnH(), p2.Zeroing(), SVEMemOperand(x1, -32));
10510 __ Add(x1, x0, -70);
10511 __ Ld1rqw(z7.VnS(), p3.Zeroing(), SVEMemOperand(x1, 32));
10512 __ Add(x1, x0, 27);
10513 __ Ld1rqd(z8.VnD(), p4.Zeroing(), SVEMemOperand(x1, 16));
10514 END();
10515
10516 if (CAN_RUN()) {
10517 RUN();
10518 uint64_t expected_z0[] = {0x0000000000000000, 0x006c006a00680066};
10519 uint64_t expected_z1[] = {0x000074730000706f, 0x00006c6b00006867};
10520 uint64_t expected_z2[] = {0x0000000075747372, 0x000000006d6c6b6a};
10521 uint64_t expected_z3[] = {0x0000000000000000, 0xc2c1c0bfbebdbcbb};
10522 uint64_t expected_z4[] = {0, 0};
10523 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
10524 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
10525 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
10526 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
10527 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
10528 ASSERT_EQUAL_SVE(z0, z5);
10529 ASSERT_EQUAL_SVE(z1, z6);
10530 ASSERT_EQUAL_SVE(z2, z7);
10531 ASSERT_EQUAL_SVE(z3, z8);
10532 }

  delete[] data;
10533}
10534
Martyn Capewellb56cf222020-05-05 17:38:28 +010010535TEST_SVE(sve_st1_vec_imm) {
10536 SVE_SETUP_WITH_FEATURES(CPUFeatures::kNEON, CPUFeatures::kSVE);
10537 START();
10538
10539 // TODO: Use mmap() to request a buffer in the low 4GB, which allows testing
10540 // 32-bit address vectors.
10541 int data_size = kZRegMaxSizeInBytes * 16;
10542 uint8_t* data = new uint8_t[data_size];
10543
10544 // Set the base to 16 bytes from the end of the buffer so we can use negative
10545 // indices.
10546 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size - 16]));
10547 __ Ptrue(p0.VnB());
10548
10549 // Store a vector of index values in reverse order, using
10550 // vector-plus-immediate addressing to begin at byte 15, then storing to
10551 // bytes 14, 13, etc.
10552 __ Index(z1.VnD(), x0, -1);
10553 __ Index(z2.VnD(), 0, 1);
10554
10555 // Iterate in order to store at least 16 bytes. The number of iterations
10556  // depends on VL, e.g. VL128 iterates eight times, storing bytes 15 and 14
10557 // on the first iteration, 13 and 12 on the next, etc.
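  // Incd bumps every element of z2 by the number of D lanes per vector, so
  // the stored index values carry on where the previous iteration left off.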
10558 uint64_t dlanes = config->sve_vl_in_bytes() / kDRegSizeInBytes;
10559 for (int i = 15; i >= 0; i -= dlanes * kBRegSizeInBytes) {
10560 __ St1b(z2.VnD(), p0, SVEMemOperand(z1.VnD(), i));
10561 __ Incd(z2.VnD());
10562 }
10563
10564 // Reload the stored data, and build a reference for comparison. The reference
10565 // is truncated to a Q register, as only the least-significant 128 bits are
10566 // checked.
10567 __ Ldr(q4, MemOperand(x0));
10568 __ Index(z5.VnB(), 15, -1);
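  // Writing the Q register clears the Z register bits above 128, truncating
  // the reference to match the 128-bit load in q4.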
10569 __ Mov(q5, q5);
10570
10571 // Repeat for wider elements.
10572 __ Index(z1.VnD(), x0, -2); // Stepping by -2 for H-sized elements.
10573 __ Index(z2.VnD(), 0, 1);
10574 for (int i = 14; i >= 0; i -= dlanes * kHRegSizeInBytes) {
10575 __ St1h(z2.VnD(), p0, SVEMemOperand(z1.VnD(), i));
10576 __ Incd(z2.VnD());
10577 }
10578 __ Ldr(q6, MemOperand(x0));
10579 __ Index(z7.VnH(), 7, -1);
10580 __ Mov(q7, q7);
10581
10582 __ Index(z1.VnD(), x0, -4); // Stepping by -4 for S-sized elements.
10583 __ Index(z2.VnD(), 0, 1);
10584 for (int i = 12; i >= 0; i -= dlanes * kSRegSizeInBytes) {
10585 __ St1w(z2.VnD(), p0, SVEMemOperand(z1.VnD(), i));
10586 __ Incd(z2.VnD());
10587 }
10588 __ Ldr(q8, MemOperand(x0));
10589 __ Index(z9.VnS(), 3, -1);
10590 __ Mov(q9, q9);
10591
10592 __ Index(z1.VnD(), x0, -8); // Stepping by -8 for D-sized elements.
10593 __ Index(z2.VnD(), 0, 1);
10594 for (int i = 8; i >= 0; i -= dlanes * kDRegSizeInBytes) {
10595 __ St1d(z2.VnD(), p0, SVEMemOperand(z1.VnD(), i));
10596 __ Incd(z2.VnD());
10597 }
10598 __ Ldr(q10, MemOperand(x0));
10599 __ Index(z11.VnD(), 1, -1);
10600 __ Mov(q11, q11);
10601
10602 // Test predication by storing even halfwords to memory (using predication)
10603 // at byte-separated addresses. The result should be the same as storing
10604 // even halfwords contiguously to memory.
10605 __ Pfalse(p1.VnB());
10606 __ Zip1(p1.VnD(), p0.VnD(), p1.VnD());
10607 __ Mov(x0, reinterpret_cast<uintptr_t>(data));
10608 __ Index(z1.VnD(), x0, 1);
10609 __ Index(z2.VnD(), 0x1000, 1);
10610 for (int i = 0; i < 16; i += dlanes) {
10611 __ St1h(z2.VnD(), p1, SVEMemOperand(z1.VnD(), i));
10612 __ Incd(z2.VnD());
10613 }
10614 __ Ldr(q2, MemOperand(x0));
10615 __ Index(z3.VnH(), 0x1000, 2);
10616 __ Mov(q3, q3);
10617
10618 END();
10619
10620 if (CAN_RUN()) {
10621 RUN();
10622
10623 ASSERT_EQUAL_SVE(z3, z2);
10624 ASSERT_EQUAL_SVE(z5, z4);
10625 ASSERT_EQUAL_SVE(z7, z6);
10626 ASSERT_EQUAL_SVE(z9, z8);
10627 ASSERT_EQUAL_SVE(z11, z10);
10628 }

  delete[] data;
10629}
10630
TatWai Chong5f3928c2020-06-11 00:09:20 -070010631template <typename T>
10632static void sve_st1_scalar_plus_vector_helper(Test* config,
10633 int esize_in_bits,
10634 T mod,
10635 bool is_scaled) {
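  // The stores under test write each lane's own offset value out through the
  // addressing mode being tested; the value is then loaded back with the
  // matching Ld1 form and compared against the offset vector masked by the
  // governing predicate, so a mis-addressed lane shows up as a mismatch.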
Martyn Capewellfa098bc2020-05-12 10:21:56 +010010636 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
10637 START();
10638
10639 int vl = config->sve_vl_in_bytes();
10640 int data_size = vl * 160;
10641 uint8_t* data = new uint8_t[data_size];
10642 memset(data, 0, data_size);
TatWai Chong5f3928c2020-06-11 00:09:20 -070010643 int vl_per_esize = vl / (esize_in_bits / kBitsPerByte);
10644
10645 ZRegister zn_b = z0.WithLaneSize(esize_in_bits);
10646 ZRegister zn_h = z1.WithLaneSize(esize_in_bits);
10647 ZRegister zn_s = z2.WithLaneSize(esize_in_bits);
10648 ZRegister zn_d = z3.WithLaneSize(esize_in_bits);
10649
10650 ZRegister zn_ld_b = z10.WithLaneSize(esize_in_bits);
10651 ZRegister zn_ld_h = z11.WithLaneSize(esize_in_bits);
10652 ZRegister zn_ld_s = z12.WithLaneSize(esize_in_bits);
10653 ZRegister zn_ld_d = z13.WithLaneSize(esize_in_bits);
10654 ZRegister offsets = z31.WithLaneSize(esize_in_bits);
Martyn Capewellfa098bc2020-05-12 10:21:56 +010010655
10656 // Set the base half-way through the buffer so we can use negative indices.
10657 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
TatWai Chong5f3928c2020-06-11 00:09:20 -070010658 __ Ptrue(p6.WithLaneSize(esize_in_bits));
10659 __ Pfalse(p7.WithLaneSize(esize_in_bits));
10660 __ Zip1(p0.WithLaneSize(esize_in_bits),
10661 p6.WithLaneSize(esize_in_bits),
10662 p7.WithLaneSize(esize_in_bits));
10663 __ Zip1(p1.WithLaneSize(esize_in_bits),
10664 p7.WithLaneSize(esize_in_bits),
10665 p6.WithLaneSize(esize_in_bits));
Martyn Capewellfa098bc2020-05-12 10:21:56 +010010666
TatWai Chong5f3928c2020-06-11 00:09:20 -070010667 // `st1b` doesn't have the scaled-offset forms.
10668 if (is_scaled == false) {
10669    // Step the index by 2 to simulate a scatter memory access.
10670 __ Index(offsets, 1, 2);
10671 __ St1b(offsets, p0, SVEMemOperand(x0, offsets, mod));
10672 __ Ld1b(zn_ld_b, p0.Zeroing(), SVEMemOperand(x0, offsets, mod));
10673 __ Dup(zn_b, 0);
10674 __ Mov(zn_b, p0.Merging(), offsets);
10675 }
Martyn Capewellfa098bc2020-05-12 10:21:56 +010010676
Martyn Capewellfa098bc2020-05-12 10:21:56 +010010677  // Store the values to an isolated range, away from the other stores.
TatWai Chong5f3928c2020-06-11 00:09:20 -070010678 int scale = is_scaled ? 1 : 0;
10679 __ Add(x1, x0, vl_per_esize * 4);
10680 __ Index(offsets, 6, 4);
10681 __ St1h(offsets, p0, SVEMemOperand(x1, offsets, mod, scale));
10682 __ Ld1h(zn_ld_h, p0.Zeroing(), SVEMemOperand(x1, offsets, mod, scale));
10683 __ Dup(zn_h, 0);
10684 __ Mov(zn_h, p0.Merging(), offsets);
Martyn Capewellfa098bc2020-05-12 10:21:56 +010010685
TatWai Chong5f3928c2020-06-11 00:09:20 -070010686 scale = is_scaled ? 2 : 0;
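  // UINT64_MAX + (vl_per_esize * -8) + 1 is the unsigned encoding of
  // -8 * vl_per_esize, so this (and the similar expression below) places the
  // base below x0, keeping these accesses clear of the other stores.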
10687 __ Add(x2, x0, UINT64_MAX + (vl_per_esize * -8) + 1);
10688 __ Index(offsets, 64, 8);
10689 if ((std::is_same<T, vixl::aarch64::Extend>::value) &&
10690 (static_cast<int>(mod) == SXTW)) {
10691 // Testing negative offsets.
10692 __ Neg(offsets, p6.Merging(), offsets);
10693 }
10694 __ St1w(offsets, p1, SVEMemOperand(x2, offsets, mod, scale));
10695 __ Ld1w(zn_ld_s, p1.Zeroing(), SVEMemOperand(x2, offsets, mod, scale));
10696 __ Dup(zn_s, 0);
10697 __ Mov(zn_s, p1.Merging(), offsets);
Martyn Capewellfa098bc2020-05-12 10:21:56 +010010698
TatWai Chong5f3928c2020-06-11 00:09:20 -070010699 if (esize_in_bits == kDRegSize) {
10700    // St1w stores only the low 32 bits of each D-sized lane, so truncate the
10701    // reference to 32 bits before comparing it with the value loaded back.
10702 __ Lsl(zn_s, zn_s, kSRegSize);
10703 __ Lsr(zn_s, zn_s, kSRegSize);
10704 }
Martyn Capewellfa098bc2020-05-12 10:21:56 +010010705
TatWai Chong5f3928c2020-06-11 00:09:20 -070010706 // `st1d` doesn't have the S-sized lane forms.
10707 if (esize_in_bits == kDRegSize) {
10708 scale = is_scaled ? 3 : 0;
10709 __ Add(x3, x0, UINT64_MAX + (vl_per_esize * -16) + 1);
10710 __ Index(offsets, 128, 16);
10711 if ((std::is_same<T, vixl::aarch64::Extend>::value) &&
10712 (static_cast<int>(mod) == SXTW)) {
10713 __ Neg(offsets, p6.Merging(), offsets);
10714 }
10715 __ St1d(offsets, p1, SVEMemOperand(x3, offsets, mod, scale));
10716 __ Ld1d(zn_ld_d, p1.Zeroing(), SVEMemOperand(x3, offsets, mod, scale));
10717 __ Dup(zn_d, 0);
10718 __ Mov(zn_d, p1.Merging(), offsets);
10719 }
Martyn Capewellfa098bc2020-05-12 10:21:56 +010010720
10721 END();
10722
10723 if (CAN_RUN()) {
10724 RUN();
10725
TatWai Chong5f3928c2020-06-11 00:09:20 -070010726    if (is_scaled == false) {
10727 ASSERT_EQUAL_SVE(zn_ld_b, zn_b);
10728 }
Martyn Capewellfa098bc2020-05-12 10:21:56 +010010729
TatWai Chong5f3928c2020-06-11 00:09:20 -070010730 ASSERT_EQUAL_SVE(zn_ld_h, zn_h);
10731 ASSERT_EQUAL_SVE(zn_ld_s, zn_s);
10732
10733 if (esize_in_bits == kDRegSize) {
10734 ASSERT_EQUAL_SVE(zn_ld_d, zn_d);
10735 }
Martyn Capewellfa098bc2020-05-12 10:21:56 +010010736 }
10737
10738 delete[] data;
10739}
10740
TatWai Chong5f3928c2020-06-11 00:09:20 -070010741TEST_SVE(sve_st1_sca_vec_32_unpacked_unscaled) {
10742 sve_st1_scalar_plus_vector_helper(config, kDRegSize, UXTW, false);
10743 sve_st1_scalar_plus_vector_helper(config, kDRegSize, SXTW, false);
10744}
10745
10746TEST_SVE(sve_st1_sca_vec_32_unpacked_scaled) {
10747 sve_st1_scalar_plus_vector_helper(config, kDRegSize, UXTW, true);
10748 sve_st1_scalar_plus_vector_helper(config, kDRegSize, SXTW, true);
10749}
10750
10751TEST_SVE(sve_st1_sca_vec_32_unscaled) {
10752 sve_st1_scalar_plus_vector_helper(config, kSRegSize, UXTW, false);
10753 sve_st1_scalar_plus_vector_helper(config, kSRegSize, SXTW, false);
10754}
10755
10756TEST_SVE(sve_st1_sca_vec_32_scaled) {
10757 sve_st1_scalar_plus_vector_helper(config, kSRegSize, UXTW, true);
10758 sve_st1_scalar_plus_vector_helper(config, kSRegSize, SXTW, true);
10759}
10760
10761TEST_SVE(sve_st1_sca_vec_64_scaled) {
10762 sve_st1_scalar_plus_vector_helper(config, kDRegSize, LSL, true);
10763}
10764
10765TEST_SVE(sve_st1_sca_vec_64_unscaled) {
10766 sve_st1_scalar_plus_vector_helper(config, kDRegSize, NO_SHIFT, false);
10767}
10768
TatWai Chong6995bfd2019-09-26 10:48:05 +010010769typedef void (MacroAssembler::*IntWideImmFn)(const ZRegister& zd,
10770 const ZRegister& zn,
10771 const IntegerOperand imm);
10772
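// Check an unpredicated integer-with-wide-immediate macro in both its
// destructive form (zd aliasing zn) and its constructive form (a distinct zd,
// which exercises the movprfx path), and verify that the source register is
// left unchanged.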
10773template <typename F, typename Td, typename Tn>
10774static void IntWideImmHelper(Test* config,
10775 F macro,
10776 unsigned lane_size_in_bits,
10777 const Tn& zn_inputs,
10778 IntegerOperand imm,
10779 const Td& zd_expected) {
10780 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
10781 START();
10782
10783 ZRegister zd1 = z0.WithLaneSize(lane_size_in_bits);
10784 InsrHelper(&masm, zd1, zn_inputs);
10785
10786 // Also test with a different zn, to test the movprfx case.
10787 ZRegister zn = z1.WithLaneSize(lane_size_in_bits);
10788 InsrHelper(&masm, zn, zn_inputs);
10789 ZRegister zd2 = z2.WithLaneSize(lane_size_in_bits);
10790 ZRegister zn_copy = z3.WithSameLaneSizeAs(zn);
10791
10792 // Make a copy so we can check that constructive operations preserve zn.
10793 __ Mov(zn_copy, zn);
10794
10795 {
10796 UseScratchRegisterScope temps(&masm);
10797 // The MacroAssembler needs a P scratch register for some of these macros,
10798 // and it doesn't have one by default.
10799 temps.Include(p3);
10800
10801 (masm.*macro)(zd1, zd1, imm);
10802 (masm.*macro)(zd2, zn, imm);
10803 }
10804
10805 END();
10806
10807 if (CAN_RUN()) {
10808 RUN();
10809
10810 ASSERT_EQUAL_SVE(zd_expected, zd1);
10811
10812    // Check that the constructive form (zd2, which exercises the movprfx
10813    // path) gives the same result as the destructive form.
10814 ASSERT_EQUAL_SVE(zd_expected, zd2);
10815
10816 ASSERT_EQUAL_SVE(zn_copy, zn);
10817 }
10818}
10819
10820TEST_SVE(sve_int_wide_imm_unpredicated_smax) {
10821 int in_b[] = {0, -128, 127, -127, 126, 1, -1, 55};
10822 int in_h[] = {0, -128, 127, INT16_MIN, INT16_MAX, 1, -1, 5555};
10823 int in_s[] = {0, -128, 127, INT32_MIN, INT32_MAX, 1, -1, 555555};
10824 int64_t in_d[] = {1, 10, 10000, 1000000};
10825
10826 IntWideImmFn fn = &MacroAssembler::Smax;
10827
10828 int exp_b_1[] = {0, -1, 127, -1, 126, 1, -1, 55};
10829 int exp_h_1[] = {127, 127, 127, 127, INT16_MAX, 127, 127, 5555};
10830 int exp_s_1[] = {0, -128, 127, -128, INT32_MAX, 1, -1, 555555};
10831 int64_t exp_d_1[] = {99, 99, 10000, 1000000};
10832
10833 IntWideImmHelper(config, fn, kBRegSize, in_b, -1, exp_b_1);
10834 IntWideImmHelper(config, fn, kHRegSize, in_h, 127, exp_h_1);
10835 IntWideImmHelper(config, fn, kSRegSize, in_s, -128, exp_s_1);
10836 IntWideImmHelper(config, fn, kDRegSize, in_d, 99, exp_d_1);
10837
10838 int exp_h_2[] = {0, -128, 127, -255, INT16_MAX, 1, -1, 5555};
10839 int exp_s_2[] = {2048, 2048, 2048, 2048, INT32_MAX, 2048, 2048, 555555};
10840 int64_t exp_d_2[] = {INT16_MAX, INT16_MAX, INT16_MAX, 1000000};
10841
10842 // The immediate is in the range [-128, 127], but the macro is able to
10843 // synthesise unencodable immediates.
10844 // B-sized lanes cannot take an immediate out of the range [-128, 127].
10845 IntWideImmHelper(config, fn, kHRegSize, in_h, -255, exp_h_2);
10846 IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2);
10847 IntWideImmHelper(config, fn, kDRegSize, in_d, INT16_MAX, exp_d_2);
10848}
10849
10850TEST_SVE(sve_int_wide_imm_unpredicated_smin) {
10851 int in_b[] = {0, -128, 127, -127, 126, 1, -1, 55};
10852 int in_h[] = {0, -128, 127, INT16_MIN, INT16_MAX, 1, -1, 5555};
10853 int in_s[] = {0, -128, 127, INT32_MIN, INT32_MAX, 1, -1, 555555};
10854 int64_t in_d[] = {1, 10, 10000, 1000000};
10855
10856 IntWideImmFn fn = &MacroAssembler::Smin;
10857
10858 int exp_b_1[] = {-1, -128, -1, -127, -1, -1, -1, -1};
10859 int exp_h_1[] = {0, -128, 127, INT16_MIN, 127, 1, -1, 127};
10860 int exp_s_1[] = {-128, -128, -128, INT32_MIN, -128, -128, -128, -128};
10861 int64_t exp_d_1[] = {1, 10, 99, 99};
10862
10863 IntWideImmHelper(config, fn, kBRegSize, in_b, -1, exp_b_1);
10864 IntWideImmHelper(config, fn, kHRegSize, in_h, 127, exp_h_1);
10865 IntWideImmHelper(config, fn, kSRegSize, in_s, -128, exp_s_1);
10866 IntWideImmHelper(config, fn, kDRegSize, in_d, 99, exp_d_1);
10867
10868 int exp_h_2[] = {-255, -255, -255, INT16_MIN, -255, -255, -255, -255};
10869 int exp_s_2[] = {0, -128, 127, INT32_MIN, 2048, 1, -1, 2048};
10870 int64_t exp_d_2[] = {1, 10, 10000, INT16_MAX};
10871
10872 // The immediate is in the range [-128, 127], but the macro is able to
10873 // synthesise unencodable immediates.
10874 // B-sized lanes cannot take an immediate out of the range [-128, 127].
10875 IntWideImmHelper(config, fn, kHRegSize, in_h, -255, exp_h_2);
10876 IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2);
10877 IntWideImmHelper(config, fn, kDRegSize, in_d, INT16_MAX, exp_d_2);
10878}
10879
10880TEST_SVE(sve_int_wide_imm_unpredicated_umax) {
10881 int in_b[] = {0, 255, 127, 0x80, 1, 55};
10882 int in_h[] = {0, 255, 127, INT16_MAX, 1, 5555};
10883 int in_s[] = {0, 0xff, 0x7f, INT32_MAX, 1, 555555};
10884 int64_t in_d[] = {1, 10, 10000, 1000000};
10885
10886 IntWideImmFn fn = &MacroAssembler::Umax;
10887
10888 int exp_b_1[] = {17, 255, 127, 0x80, 17, 55};
10889 int exp_h_1[] = {127, 255, 127, INT16_MAX, 127, 5555};
10890 int exp_s_1[] = {255, 255, 255, INT32_MAX, 255, 555555};
10891 int64_t exp_d_1[] = {99, 99, 10000, 1000000};
10892
10893 IntWideImmHelper(config, fn, kBRegSize, in_b, 17, exp_b_1);
10894 IntWideImmHelper(config, fn, kHRegSize, in_h, 0x7f, exp_h_1);
10895 IntWideImmHelper(config, fn, kSRegSize, in_s, 0xff, exp_s_1);
10896 IntWideImmHelper(config, fn, kDRegSize, in_d, 99, exp_d_1);
10897
10898 int exp_h_2[] = {511, 511, 511, INT16_MAX, 511, 5555};
10899 int exp_s_2[] = {2048, 2048, 2048, INT32_MAX, 2048, 555555};
10900 int64_t exp_d_2[] = {INT16_MAX, INT16_MAX, INT16_MAX, 1000000};
10901
10902 // The immediate is in the range [0, 255], but the macro is able to
10903 // synthesise unencodable immediates.
10904 // B-sized lanes cannot take an immediate out of the range [0, 255].
10905 IntWideImmHelper(config, fn, kHRegSize, in_h, 511, exp_h_2);
10906 IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2);
10907 IntWideImmHelper(config, fn, kDRegSize, in_d, INT16_MAX, exp_d_2);
10908}
10909
10910TEST_SVE(sve_int_wide_imm_unpredicated_umin) {
10911 int in_b[] = {0, 255, 127, 0x80, 1, 55};
10912 int in_h[] = {0, 255, 127, INT16_MAX, 1, 5555};
10913 int in_s[] = {0, 0xff, 0x7f, INT32_MAX, 1, 555555};
10914 int64_t in_d[] = {1, 10, 10000, 1000000};
10915
10916 IntWideImmFn fn = &MacroAssembler::Umin;
10917
10918 int exp_b_1[] = {0, 17, 17, 17, 1, 17};
10919 int exp_h_1[] = {0, 127, 127, 127, 1, 127};
10920 int exp_s_1[] = {0, 255, 127, 255, 1, 255};
10921 int64_t exp_d_1[] = {1, 10, 99, 99};
10922
10923 IntWideImmHelper(config, fn, kBRegSize, in_b, 17, exp_b_1);
10924 IntWideImmHelper(config, fn, kHRegSize, in_h, 0x7f, exp_h_1);
10925 IntWideImmHelper(config, fn, kSRegSize, in_s, 255, exp_s_1);
10926 IntWideImmHelper(config, fn, kDRegSize, in_d, 99, exp_d_1);
10927
10928 int exp_h_2[] = {0, 255, 127, 511, 1, 511};
10929 int exp_s_2[] = {0, 255, 127, 2048, 1, 2048};
10930 int64_t exp_d_2[] = {1, 10, 10000, INT16_MAX};
10931
10932 // The immediate is in the range [0, 255], but the macro is able to
10933 // synthesise unencodable immediates.
10934 // B-sized lanes cannot take an immediate out of the range [0, 255].
10935 IntWideImmHelper(config, fn, kHRegSize, in_h, 511, exp_h_2);
10936 IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2);
10937 IntWideImmHelper(config, fn, kDRegSize, in_d, INT16_MAX, exp_d_2);
10938}
10939
10940TEST_SVE(sve_int_wide_imm_unpredicated_mul) {
10941 int in_b[] = {11, -1, 7, -3};
10942 int in_h[] = {111, -1, 17, -123};
10943 int in_s[] = {11111, -1, 117, -12345};
10944 int64_t in_d[] = {0x7fffffff, 0x80000000};
10945
10946 IntWideImmFn fn = &MacroAssembler::Mul;
10947
10948 int exp_b_1[] = {66, -6, 42, -18};
10949 int exp_h_1[] = {-14208, 128, -2176, 15744};
10950 int exp_s_1[] = {11111 * 127, -127, 117 * 127, -12345 * 127};
10951 int64_t exp_d_1[] = {0xfffffffe, 0x100000000};
10952
10953 IntWideImmHelper(config, fn, kBRegSize, in_b, 6, exp_b_1);
10954 IntWideImmHelper(config, fn, kHRegSize, in_h, -128, exp_h_1);
10955 IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
10956 IntWideImmHelper(config, fn, kDRegSize, in_d, 2, exp_d_1);
10957
10958 int exp_h_2[] = {-28305, 255, -4335, 31365};
10959 int exp_s_2[] = {22755328, -2048, 239616, -25282560};
10960 int64_t exp_d_2[] = {0x00000063ffffff38, 0x0000006400000000};
10961
10962 // The immediate is in the range [-128, 127], but the macro is able to
10963 // synthesise unencodable immediates.
10964  // B-sized lanes cannot take an immediate out of the range [-128, 127].
10965 IntWideImmHelper(config, fn, kHRegSize, in_h, -255, exp_h_2);
10966 IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2);
10967 IntWideImmHelper(config, fn, kDRegSize, in_d, 200, exp_d_2);
10968
10969 // Integer overflow on multiplication.
10970 unsigned exp_b_3[] = {0x75, 0x81, 0x79, 0x83};
10971
10972 IntWideImmHelper(config, fn, kBRegSize, in_b, 0x7f, exp_b_3);
10973}
10974
10975TEST_SVE(sve_int_wide_imm_unpredicated_add) {
10976 unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
10977 unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
10978 unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
10979 uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};
10980
10981 IntWideImmFn fn = &MacroAssembler::Add;
10982
10983 unsigned exp_b_1[] = {0x02, 0x00, 0x91, 0x80};
10984 unsigned exp_h_1[] = {0x8191, 0x7f8f, 0x1020, 0xaaba};
10985 unsigned exp_s_1[] = {0x80018200, 0x7fff7ffe, 0xaaaaab29, 0xf000f16f};
10986 uint64_t exp_d_1[] = {0x8000000180018280, 0x7fffffff7fff807e};
10987
10988 // Encodable with `add` (shift 0).
10989 IntWideImmHelper(config, fn, kBRegSize, in_b, 0x81, exp_b_1);
10990 IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
10991 IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
10992 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff, exp_d_1);
10993
10994 unsigned exp_h_2[] = {0x9181, 0x8f7f, 0x2010, 0xbaaa};
10995 unsigned exp_s_2[] = {0x80020081, 0x7ffffe7f, 0xaaab29aa, 0xf0016ff0};
10996 uint64_t exp_d_2[] = {0x8000000180028081, 0x7fffffff80007e7f};
10997
10998 // Encodable with `add` (shift 8).
10999 // B-sized lanes cannot take a shift of 8.
11000 IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
11001 IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
11002 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
11003
11004 unsigned exp_s_3[] = {0x80808181, 0x807e7f7f, 0xab29aaaa, 0xf07ff0f0};
11005
11006 // The macro is able to synthesise unencodable immediates.
11007 IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 16, exp_s_3);
Jacob Bramleyd9f929c2019-10-02 11:42:56 +010011008
11009 unsigned exp_b_4[] = {0x61, 0x5f, 0xf0, 0xdf};
11010 unsigned exp_h_4[] = {0x6181, 0x5f7f, 0xf010, 0x8aaa};
11011 unsigned exp_s_4[] = {0x00018181, 0xffff7f7f, 0x2aaaaaaa, 0x7000f0f0};
11012 uint64_t exp_d_4[] = {0x8000000180018180, 0x7fffffff7fff7f7e};
11013
11014 // Negative immediates use `sub`.
11015 IntWideImmHelper(config, fn, kBRegSize, in_b, -0x20, exp_b_4);
11016 IntWideImmHelper(config, fn, kHRegSize, in_h, -0x2000, exp_h_4);
11017 IntWideImmHelper(config, fn, kSRegSize, in_s, INT32_MIN, exp_s_4);
11018 IntWideImmHelper(config, fn, kDRegSize, in_d, -1, exp_d_4);
TatWai Chong6995bfd2019-09-26 10:48:05 +010011019}
11020
11021TEST_SVE(sve_int_wide_imm_unpredicated_sqadd) {
11022 unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
11023 unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
11024 unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
11025 uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};
11026
11027 IntWideImmFn fn = &MacroAssembler::Sqadd;
11028
Jacob Bramleyb28f6172019-10-02 12:12:35 +010011029 unsigned exp_b_1[] = {0x02, 0x7f, 0x7f, 0x7f};
TatWai Chong6995bfd2019-09-26 10:48:05 +010011030 unsigned exp_h_1[] = {0x8191, 0x7f8f, 0x1020, 0xaaba};
11031 unsigned exp_s_1[] = {0x80018200, 0x7fff7ffe, 0xaaaaab29, 0xf000f16f};
11032 uint64_t exp_d_1[] = {0x8000000180018280, 0x7fffffff7fff807e};
11033
11034 // Encodable with `sqadd` (shift 0).
Jacob Bramleyb28f6172019-10-02 12:12:35 +010011035 // Note that encodable immediates are unsigned, even for signed saturation.
11036 IntWideImmHelper(config, fn, kBRegSize, in_b, 129, exp_b_1);
TatWai Chong6995bfd2019-09-26 10:48:05 +010011037 IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
11038 IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
Jacob Bramleyb28f6172019-10-02 12:12:35 +010011039 IntWideImmHelper(config, fn, kDRegSize, in_d, 255, exp_d_1);
TatWai Chong6995bfd2019-09-26 10:48:05 +010011040
11041 unsigned exp_h_2[] = {0x9181, 0x7fff, 0x2010, 0xbaaa};
11042 unsigned exp_s_2[] = {0x80020081, 0x7ffffe7f, 0xaaab29aa, 0xf0016ff0};
11043 uint64_t exp_d_2[] = {0x8000000180028081, 0x7fffffff80007e7f};
11044
11045 // Encodable with `sqadd` (shift 8).
11046 // B-sized lanes cannot take a shift of 8.
11047 IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
11048 IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
11049 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
TatWai Chong6995bfd2019-09-26 10:48:05 +010011050}
11051
11052TEST_SVE(sve_int_wide_imm_unpredicated_uqadd) {
11053 unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
11054 unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
11055 unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
11056 uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};
11057
11058 IntWideImmFn fn = &MacroAssembler::Uqadd;
11059
11060 unsigned exp_b_1[] = {0xff, 0xff, 0x91, 0xff};
11061 unsigned exp_h_1[] = {0x8191, 0x7f8f, 0x1020, 0xaaba};
11062 unsigned exp_s_1[] = {0x80018200, 0x7fff7ffe, 0xaaaaab29, 0xf000f16f};
11063 uint64_t exp_d_1[] = {0x8000000180018280, 0x7fffffff7fff807e};
11064
11065 // Encodable with `uqadd` (shift 0).
11066 IntWideImmHelper(config, fn, kBRegSize, in_b, 0x81, exp_b_1);
11067 IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
11068 IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
11069 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff, exp_d_1);
11070
11071 unsigned exp_h_2[] = {0x9181, 0x8f7f, 0x2010, 0xbaaa};
11072 unsigned exp_s_2[] = {0x80020081, 0x7ffffe7f, 0xaaab29aa, 0xf0016ff0};
11073 uint64_t exp_d_2[] = {0x8000000180028081, 0x7fffffff80007e7f};
11074
11075 // Encodable with `uqadd` (shift 8).
11076 // B-sized lanes cannot take a shift of 8.
11077 IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
11078 IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
11079 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
TatWai Chong6995bfd2019-09-26 10:48:05 +010011080}
11081
11082TEST_SVE(sve_int_wide_imm_unpredicated_sub) {
11083 unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
11084 unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
11085 unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
11086 uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};
11087
11088 IntWideImmFn fn = &MacroAssembler::Sub;
11089
11090 unsigned exp_b_1[] = {0x00, 0xfe, 0x8f, 0x7e};
11091 unsigned exp_h_1[] = {0x8171, 0x7f6f, 0x1000, 0xaa9a};
11092 unsigned exp_s_1[] = {0x80018102, 0x7fff7f00, 0xaaaaaa2b, 0xf000f071};
11093 uint64_t exp_d_1[] = {0x8000000180018082, 0x7fffffff7fff7e80};
11094
11095 // Encodable with `sub` (shift 0).
11096 IntWideImmHelper(config, fn, kBRegSize, in_b, 0x81, exp_b_1);
11097 IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
11098 IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
11099 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff, exp_d_1);
11100
11101 unsigned exp_h_2[] = {0x7181, 0x6f7f, 0x0010, 0x9aaa};
11102 unsigned exp_s_2[] = {0x80010281, 0x7fff007f, 0xaaaa2baa, 0xf00071f0};
11103 uint64_t exp_d_2[] = {0x8000000180008281, 0x7fffffff7ffe807f};
11104
11105 // Encodable with `sub` (shift 8).
11106 // B-sized lanes cannot take a shift of 8.
11107 IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
11108 IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
11109 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
11110
11111 unsigned exp_s_3[] = {0x7f828181, 0x7f807f7f, 0xaa2baaaa, 0xef81f0f0};
11112
11113 // The macro is able to synthesise unencodable immediates.
11114 IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 16, exp_s_3);
Jacob Bramleyd9f929c2019-10-02 11:42:56 +010011115
11116 unsigned exp_b_4[] = {0xa1, 0x9f, 0x30, 0x1f};
11117 unsigned exp_h_4[] = {0xa181, 0x9f7f, 0x3010, 0xcaaa};
11118 unsigned exp_s_4[] = {0x00018181, 0xffff7f7f, 0x2aaaaaaa, 0x7000f0f0};
11119 uint64_t exp_d_4[] = {0x8000000180018182, 0x7fffffff7fff7f80};
11120
11121 // Negative immediates use `add`.
11122 IntWideImmHelper(config, fn, kBRegSize, in_b, -0x20, exp_b_4);
11123 IntWideImmHelper(config, fn, kHRegSize, in_h, -0x2000, exp_h_4);
11124 IntWideImmHelper(config, fn, kSRegSize, in_s, INT32_MIN, exp_s_4);
11125 IntWideImmHelper(config, fn, kDRegSize, in_d, -1, exp_d_4);
TatWai Chong6995bfd2019-09-26 10:48:05 +010011126}
11127
11128TEST_SVE(sve_int_wide_imm_unpredicated_sqsub) {
11129 unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
11130 unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
11131 unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
11132 uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};
11133
11134 IntWideImmFn fn = &MacroAssembler::Sqsub;
11135
Jacob Bramleyb28f6172019-10-02 12:12:35 +010011136 unsigned exp_b_1[] = {0x80, 0xfe, 0x8f, 0x80};
TatWai Chong6995bfd2019-09-26 10:48:05 +010011137 unsigned exp_h_1[] = {0x8171, 0x7f6f, 0x1000, 0xaa9a};
11138 unsigned exp_s_1[] = {0x80018102, 0x7fff7f00, 0xaaaaaa2b, 0xf000f071};
11139 uint64_t exp_d_1[] = {0x8000000180018082, 0x7fffffff7fff7e80};
11140
11141 // Encodable with `sqsub` (shift 0).
Jacob Bramleyb28f6172019-10-02 12:12:35 +010011142 // Note that encodable immediates are unsigned, even for signed saturation.
11143 IntWideImmHelper(config, fn, kBRegSize, in_b, 129, exp_b_1);
TatWai Chong6995bfd2019-09-26 10:48:05 +010011144 IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
11145 IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
Jacob Bramleyb28f6172019-10-02 12:12:35 +010011146 IntWideImmHelper(config, fn, kDRegSize, in_d, 255, exp_d_1);
TatWai Chong6995bfd2019-09-26 10:48:05 +010011147
11148 unsigned exp_h_2[] = {0x8000, 0x6f7f, 0x0010, 0x9aaa};
11149 unsigned exp_s_2[] = {0x80010281, 0x7fff007f, 0xaaaa2baa, 0xf00071f0};
11150 uint64_t exp_d_2[] = {0x8000000180008281, 0x7fffffff7ffe807f};
11151
11152 // Encodable with `sqsub` (shift 8).
11153 // B-sized lanes cannot take a shift of 8.
11154 IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
11155 IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
11156 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
TatWai Chong6995bfd2019-09-26 10:48:05 +010011157}
11158
11159TEST_SVE(sve_int_wide_imm_unpredicated_uqsub) {
11160 unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
11161 unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
11162 unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
11163 uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};
11164
11165 IntWideImmFn fn = &MacroAssembler::Uqsub;
11166
11167 unsigned exp_b_1[] = {0x00, 0x00, 0x00, 0x7e};
11168 unsigned exp_h_1[] = {0x8171, 0x7f6f, 0x1000, 0xaa9a};
11169 unsigned exp_s_1[] = {0x80018102, 0x7fff7f00, 0xaaaaaa2b, 0xf000f071};
11170 uint64_t exp_d_1[] = {0x8000000180018082, 0x7fffffff7fff7e80};
11171
11172 // Encodable with `uqsub` (shift 0).
11173 IntWideImmHelper(config, fn, kBRegSize, in_b, 0x81, exp_b_1);
11174 IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
11175 IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
11176 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff, exp_d_1);
11177
11178 unsigned exp_h_2[] = {0x7181, 0x6f7f, 0x0010, 0x9aaa};
11179 unsigned exp_s_2[] = {0x80010281, 0x7fff007f, 0xaaaa2baa, 0xf00071f0};
11180 uint64_t exp_d_2[] = {0x8000000180008281, 0x7fffffff7ffe807f};
11181
11182 // Encodable with `uqsub` (shift 8).
11183 // B-sized lanes cannot take a shift of 8.
11184 IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
11185 IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
11186 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
TatWai Chong6995bfd2019-09-26 10:48:05 +010011187}
11188
11189TEST_SVE(sve_int_wide_imm_unpredicated_subr) {
11190 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
11191 START();
11192
11193 // Encodable with `subr` (shift 0).
11194 __ Index(z0.VnD(), 1, 1);
11195 __ Sub(z0.VnD(), 100, z0.VnD());
11196 __ Index(z1.VnS(), 0x7f, 1);
11197 __ Sub(z1.VnS(), 0xf7, z1.VnS());
11198 __ Index(z2.VnH(), 0xaaaa, 0x2222);
11199 __ Sub(z2.VnH(), 0x80, z2.VnH());
11200 __ Index(z3.VnB(), 133, 1);
11201 __ Sub(z3.VnB(), 255, z3.VnB());
11202
11203 // Encodable with `subr` (shift 8).
11204 __ Index(z4.VnD(), 256, -1);
11205 __ Sub(z4.VnD(), 42 * 256, z4.VnD());
11206 __ Index(z5.VnS(), 0x7878, 1);
11207 __ Sub(z5.VnS(), 0x8000, z5.VnS());
11208 __ Index(z6.VnH(), 0x30f0, -1);
11209 __ Sub(z6.VnH(), 0x7f00, z6.VnH());
11210 // B-sized lanes cannot take a shift of 8.
11211
11212  // Use a distinct destination so the macro selects the movprfx form.
11213 __ Index(z31.VnD(), 256, 4001);
11214 __ Sub(z7.VnD(), 42 * 256, z31.VnD());
11215
11216  // The immediate is outside the encodable range of `subr`.
11217 __ Index(z30.VnS(), 0x11223344, 1);
11218 __ Sub(z8.VnS(), 0x88776655, z30.VnS());
11219
11220 END();
11221
11222 if (CAN_RUN()) {
11223 RUN();
11224
11225 int expected_z0[] = {87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99};
11226 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
11227
11228 int expected_z1[] = {0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78};
11229 ASSERT_EQUAL_SVE(expected_z1, z1.VnS());
11230
11231 int expected_z2[] = {0xab2c, 0xcd4e, 0xef70, 0x1192, 0x33b4, 0x55d6};
11232 ASSERT_EQUAL_SVE(expected_z2, z2.VnH());
11233
11234 int expected_z3[] = {0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a};
11235 ASSERT_EQUAL_SVE(expected_z3, z3.VnB());
11236
11237 int expected_z4[] = {10502, 10501, 10500, 10499, 10498, 10497, 10496};
11238 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
11239
11240 int expected_z5[] = {0x0783, 0x0784, 0x0785, 0x0786, 0x0787, 0x0788};
11241 ASSERT_EQUAL_SVE(expected_z5, z5.VnS());
11242
11243 int expected_z6[] = {0x4e15, 0x4e14, 0x4e13, 0x4e12, 0x4e11, 0x4e10};
11244 ASSERT_EQUAL_SVE(expected_z6, z6.VnH());
11245
11246 int expected_z7[] = {-13510, -9509, -5508, -1507, 2494, 6495, 10496};
11247 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
11248
11249 int expected_z8[] = {0x7755330e, 0x7755330f, 0x77553310, 0x77553311};
11250 ASSERT_EQUAL_SVE(expected_z8, z8.VnS());
11251 }
11252}
11253
11254TEST_SVE(sve_int_wide_imm_unpredicated_fdup) {
11255 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
11256 START();
11257
11258 // Immediates which can be encoded in the instructions.
11259 __ Fdup(z0.VnH(), RawbitsToFloat16(0xc500));
11260 __ Fdup(z1.VnS(), Float16(2.0));
11261 __ Fdup(z2.VnD(), Float16(3.875));
11262 __ Fdup(z3.VnH(), 8.0f);
11263 __ Fdup(z4.VnS(), -4.75f);
11264 __ Fdup(z5.VnD(), 0.5f);
11265 __ Fdup(z6.VnH(), 1.0);
11266 __ Fdup(z7.VnS(), 2.125);
11267 __ Fdup(z8.VnD(), -13.0);
11268
11269 // Immediates which cannot be encoded in the instructions.
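  // The encodable immediate is the standard AArch64 8-bit FP form,
  // +/-(1 + m/16) * 2^e with m in [0, 15] and e in [-3, 4], so zero,
  // infinities, out-of-range magnitudes like 255.0 and values that don't fit
  // the pattern like 12.3456 have to be materialised some other way.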
11270 __ Fdup(z10.VnH(), Float16(0.0));
11271 __ Fdup(z11.VnH(), kFP16PositiveInfinity);
11272 __ Fdup(z12.VnS(), 255.0f);
11273 __ Fdup(z13.VnS(), kFP32NegativeInfinity);
11274 __ Fdup(z14.VnD(), 12.3456);
11275 __ Fdup(z15.VnD(), kFP64PositiveInfinity);
11276
11277 END();
11278
11279 if (CAN_RUN()) {
11280 RUN();
11281
11282 ASSERT_EQUAL_SVE(0xc500, z0.VnH());
11283 ASSERT_EQUAL_SVE(0x40000000, z1.VnS());
11284 ASSERT_EQUAL_SVE(0x400f000000000000, z2.VnD());
11285 ASSERT_EQUAL_SVE(0x4800, z3.VnH());
11286 ASSERT_EQUAL_SVE(FloatToRawbits(-4.75f), z4.VnS());
11287 ASSERT_EQUAL_SVE(DoubleToRawbits(0.5), z5.VnD());
11288 ASSERT_EQUAL_SVE(0x3c00, z6.VnH());
11289 ASSERT_EQUAL_SVE(FloatToRawbits(2.125f), z7.VnS());
11290 ASSERT_EQUAL_SVE(DoubleToRawbits(-13.0), z8.VnD());
11291
11292 ASSERT_EQUAL_SVE(0x0000, z10.VnH());
11293 ASSERT_EQUAL_SVE(Float16ToRawbits(kFP16PositiveInfinity), z11.VnH());
11294 ASSERT_EQUAL_SVE(FloatToRawbits(255.0), z12.VnS());
11295 ASSERT_EQUAL_SVE(FloatToRawbits(kFP32NegativeInfinity), z13.VnS());
11296 ASSERT_EQUAL_SVE(DoubleToRawbits(12.3456), z14.VnD());
11297 ASSERT_EQUAL_SVE(DoubleToRawbits(kFP64PositiveInfinity), z15.VnD());
11298 }
11299}
11300
TatWai Chong6f111bc2019-10-07 09:20:37 +010011301TEST_SVE(sve_andv_eorv_orv) {
11302 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
11303 START();
11304
11305 uint64_t in[] = {0x8899aabbccddeeff, 0x7777555533331111, 0x123456789abcdef0};
11306 InsrHelper(&masm, z31.VnD(), in);
11307
11308 // For simplicity, we re-use the same pg for various lane sizes.
11309 // For D lanes: 1, 1, 0
11310 // For S lanes: 1, 1, 1, 0, 0
11311 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
11312 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
11313 Initialise(&masm, p0.VnB(), pg_in);
11314
11315 // Make a copy so we can check that constructive operations preserve zn.
11316 __ Mov(z0, z31);
11317 __ Andv(b0, p0, z0.VnB()); // destructive
11318 __ Andv(h1, p0, z31.VnH());
11319 __ Mov(z2, z31);
11320 __ Andv(s2, p0, z2.VnS()); // destructive
11321 __ Andv(d3, p0, z31.VnD());
11322
11323 __ Eorv(b4, p0, z31.VnB());
11324 __ Mov(z5, z31);
11325 __ Eorv(h5, p0, z5.VnH()); // destructive
11326 __ Eorv(s6, p0, z31.VnS());
11327 __ Mov(z7, z31);
11328 __ Eorv(d7, p0, z7.VnD()); // destructive
11329
11330 __ Mov(z8, z31);
11331 __ Orv(b8, p0, z8.VnB()); // destructive
11332 __ Orv(h9, p0, z31.VnH());
11333 __ Mov(z10, z31);
11334 __ Orv(s10, p0, z10.VnS()); // destructive
11335 __ Orv(d11, p0, z31.VnD());
11336
11337 END();
11338
11339 if (CAN_RUN()) {
11340 RUN();
11341
11342 if (static_cast<int>(ArrayLength(pg_in)) >= config->sve_vl_in_bytes()) {
11343 ASSERT_EQUAL_64(0x10, d0);
11344 ASSERT_EQUAL_64(0x1010, d1);
11345 ASSERT_EQUAL_64(0x33331111, d2);
11346 ASSERT_EQUAL_64(0x7777555533331111, d3);
11347 ASSERT_EQUAL_64(0xbf, d4);
11348 ASSERT_EQUAL_64(0xedcb, d5);
11349 ASSERT_EQUAL_64(0x44444444, d6);
11350 ASSERT_EQUAL_64(0x7777555533331111, d7);
11351 ASSERT_EQUAL_64(0xff, d8);
11352 ASSERT_EQUAL_64(0xffff, d9);
11353 ASSERT_EQUAL_64(0x77775555, d10);
11354 ASSERT_EQUAL_64(0x7777555533331111, d11);
11355 } else {
11356 ASSERT_EQUAL_64(0, d0);
11357 ASSERT_EQUAL_64(0x0010, d1);
11358 ASSERT_EQUAL_64(0x00110011, d2);
11359 ASSERT_EQUAL_64(0x0011001100110011, d3);
11360 ASSERT_EQUAL_64(0x62, d4);
11361 ASSERT_EQUAL_64(0x0334, d5);
11362 ASSERT_EQUAL_64(0x8899aabb, d6);
11363 ASSERT_EQUAL_64(0xffeeffeeffeeffee, d7);
11364 ASSERT_EQUAL_64(0xff, d8);
11365 ASSERT_EQUAL_64(0xffff, d9);
11366 ASSERT_EQUAL_64(0xffffffff, d10);
11367 ASSERT_EQUAL_64(0xffffffffffffffff, d11);
11368 }
11369
11370 // Check the upper lanes above the top of the V register are all clear.
11371 for (int i = 1; i < core.GetSVELaneCount(kDRegSize); i++) {
11372 ASSERT_EQUAL_SVE_LANE(0, z0.VnD(), i);
11373 ASSERT_EQUAL_SVE_LANE(0, z1.VnD(), i);
11374 ASSERT_EQUAL_SVE_LANE(0, z2.VnD(), i);
11375 ASSERT_EQUAL_SVE_LANE(0, z3.VnD(), i);
11376 ASSERT_EQUAL_SVE_LANE(0, z4.VnD(), i);
11377 ASSERT_EQUAL_SVE_LANE(0, z5.VnD(), i);
11378 ASSERT_EQUAL_SVE_LANE(0, z6.VnD(), i);
11379 ASSERT_EQUAL_SVE_LANE(0, z7.VnD(), i);
11380 ASSERT_EQUAL_SVE_LANE(0, z8.VnD(), i);
11381 ASSERT_EQUAL_SVE_LANE(0, z9.VnD(), i);
11382 ASSERT_EQUAL_SVE_LANE(0, z10.VnD(), i);
11383 ASSERT_EQUAL_SVE_LANE(0, z11.VnD(), i);
11384 }
11385 }
11386}
11387
TatWai Chongb2d8d1f2019-10-21 15:19:31 -070011388
11389TEST_SVE(sve_saddv_uaddv) {
11390 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
11391 START();
11392
11393 uint64_t in[] = {0x8899aabbccddeeff, 0x8182838485868788, 0x0807060504030201};
11394 InsrHelper(&masm, z31.VnD(), in);
11395
11396 // For simplicity, we re-use the same pg for various lane sizes.
11397 // For D lanes: 1, 1, 0
11398 // For S lanes: 1, 1, 1, 0, 0
11399 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
11400 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
11401 Initialise(&masm, p0.VnB(), pg_in);
11402
11403 // Make a copy so we can check that constructive operations preserve zn.
11404 __ Mov(z0, z31);
11405 __ Saddv(b0, p0, z0.VnB()); // destructive
11406 __ Saddv(h1, p0, z31.VnH());
11407 __ Mov(z2, z31);
11408 __ Saddv(s2, p0, z2.VnS()); // destructive
11409
11410 __ Uaddv(b4, p0, z31.VnB());
11411 __ Mov(z5, z31);
11412 __ Uaddv(h5, p0, z5.VnH()); // destructive
11413 __ Uaddv(s6, p0, z31.VnS());
11414 __ Mov(z7, z31);
11415 __ Uaddv(d7, p0, z7.VnD()); // destructive
11416
11417 END();
11418
11419 if (CAN_RUN()) {
11420 RUN();
11421
11422 if (static_cast<int>(ArrayLength(pg_in)) >= config->sve_vl_in_bytes()) {
11423 // Saddv
11424 ASSERT_EQUAL_64(0xfffffffffffffda9, d0);
11425 ASSERT_EQUAL_64(0xfffffffffffe9495, d1);
11426 ASSERT_EQUAL_64(0xffffffff07090b0c, d2);
11427 // Uaddv
11428 ASSERT_EQUAL_64(0x00000000000002a9, d4);
11429 ASSERT_EQUAL_64(0x0000000000019495, d5);
11430 ASSERT_EQUAL_64(0x0000000107090b0c, d6);
11431 ASSERT_EQUAL_64(0x8182838485868788, d7);
11432 } else {
11433 // Saddv
11434 ASSERT_EQUAL_64(0xfffffffffffffd62, d0);
11435 ASSERT_EQUAL_64(0xfffffffffffe8394, d1);
11436 ASSERT_EQUAL_64(0xfffffffed3e6fa0b, d2);
11437 // Uaddv
11438 ASSERT_EQUAL_64(0x0000000000000562, d4);
11439 ASSERT_EQUAL_64(0x0000000000028394, d5);
11440 ASSERT_EQUAL_64(0x00000001d3e6fa0b, d6);
11441 ASSERT_EQUAL_64(0x0a1c2e4052647687, d7);
11442 }
11443
11444 // Check the upper lanes above the top of the V register are all clear.
11445 for (int i = 1; i < core.GetSVELaneCount(kDRegSize); i++) {
11446 ASSERT_EQUAL_SVE_LANE(0, z0.VnD(), i);
11447 ASSERT_EQUAL_SVE_LANE(0, z1.VnD(), i);
11448 ASSERT_EQUAL_SVE_LANE(0, z2.VnD(), i);
11449 ASSERT_EQUAL_SVE_LANE(0, z4.VnD(), i);
11450 ASSERT_EQUAL_SVE_LANE(0, z5.VnD(), i);
11451 ASSERT_EQUAL_SVE_LANE(0, z6.VnD(), i);
11452 ASSERT_EQUAL_SVE_LANE(0, z7.VnD(), i);
11453 }
11454 }
11455}
11456
11457
11458TEST_SVE(sve_sminv_uminv) {
11459 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
11460 START();
11461
11462 uint64_t in[] = {0xfffa5555aaaaaaaa, 0x0011223344aafe80, 0x00112233aabbfc00};
11463 InsrHelper(&masm, z31.VnD(), in);
11464
11465 // For simplicity, we re-use the same pg for various lane sizes.
11466 // For D lanes: 1, 0, 1
11467 // For S lanes: 1, 1, 0, 0, 1
11468 // For H lanes: 1, 1, 0, 1, 1, 0, 0, 0, 1, 1
11469 int pg_in[] = {1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1};
11470 Initialise(&masm, p0.VnB(), pg_in);
11471
11472 // Make a copy so we can check that constructive operations preserve zn.
11473 __ Mov(z0, z31);
11474 __ Sminv(b0, p0, z0.VnB()); // destructive
11475 __ Sminv(h1, p0, z31.VnH());
11476 __ Mov(z2, z31);
11477 __ Sminv(s2, p0, z2.VnS()); // destructive
11478 __ Sminv(d3, p0, z31.VnD());
11479
11480 __ Uminv(b4, p0, z31.VnB());
11481 __ Mov(z5, z31);
11482 __ Uminv(h5, p0, z5.VnH()); // destructive
11483 __ Uminv(s6, p0, z31.VnS());
11484 __ Mov(z7, z31);
11485 __ Uminv(d7, p0, z7.VnD()); // destructive
11486
11487 END();
11488
11489 if (CAN_RUN()) {
11490 RUN();
11491
11492 if (static_cast<int>(ArrayLength(pg_in)) >= config->sve_vl_in_bytes()) {
11493 // Sminv
11494 ASSERT_EQUAL_64(0xaa, d0);
11495 ASSERT_EQUAL_64(0xaabb, d1);
11496 ASSERT_EQUAL_64(0xaabbfc00, d2);
11497 ASSERT_EQUAL_64(0x00112233aabbfc00, d3); // The smaller lane is inactive.
11498 // Uminv
11499 ASSERT_EQUAL_64(0, d4);
11500 ASSERT_EQUAL_64(0x2233, d5);
11501 ASSERT_EQUAL_64(0x112233, d6);
11502 ASSERT_EQUAL_64(0x00112233aabbfc00, d7); // The smaller lane is inactive.
11503 } else {
11504 // Sminv
11505 ASSERT_EQUAL_64(0xaa, d0);
11506 ASSERT_EQUAL_64(0xaaaa, d1);
11507 ASSERT_EQUAL_64(0xaaaaaaaa, d2);
11508 ASSERT_EQUAL_64(0xfffa5555aaaaaaaa, d3);
11509 // Uminv
11510 ASSERT_EQUAL_64(0, d4);
11511 ASSERT_EQUAL_64(0x2233, d5);
11512 ASSERT_EQUAL_64(0x112233, d6);
11513 ASSERT_EQUAL_64(0x00112233aabbfc00, d7);
11514 }
11515
11516 // Check the upper lanes above the top of the V register are all clear.
11517 for (int i = 1; i < core.GetSVELaneCount(kDRegSize); i++) {
11518 ASSERT_EQUAL_SVE_LANE(0, z0.VnD(), i);
11519 ASSERT_EQUAL_SVE_LANE(0, z1.VnD(), i);
11520 ASSERT_EQUAL_SVE_LANE(0, z2.VnD(), i);
11521 ASSERT_EQUAL_SVE_LANE(0, z3.VnD(), i);
11522 ASSERT_EQUAL_SVE_LANE(0, z4.VnD(), i);
11523 ASSERT_EQUAL_SVE_LANE(0, z5.VnD(), i);
11524 ASSERT_EQUAL_SVE_LANE(0, z6.VnD(), i);
11525 ASSERT_EQUAL_SVE_LANE(0, z7.VnD(), i);
11526 }
11527 }
11528}
11529
11530TEST_SVE(sve_smaxv_umaxv) {
11531 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
11532 START();
11533
11534 uint64_t in[] = {0xfffa5555aaaaaaaa, 0x0011223344aafe80, 0x00112233aabbfc00};
11535 InsrHelper(&masm, z31.VnD(), in);
11536
11537 // For simplicity, we re-use the same pg for various lane sizes.
11538 // For D lanes: 1, 0, 1
11539 // For S lanes: 1, 1, 0, 0, 1
11540 // For H lanes: 1, 1, 0, 1, 1, 0, 0, 0, 1, 1
11541 int pg_in[] = {1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1};
11542 Initialise(&masm, p0.VnB(), pg_in);
11543
11544 // Make a copy so we can check that constructive operations preserve zn.
11545 __ Mov(z0, z31);
11546 __ Smaxv(b0, p0, z0.VnB()); // destructive
11547 __ Smaxv(h1, p0, z31.VnH());
11548 __ Mov(z2, z31);
11549 __ Smaxv(s2, p0, z2.VnS()); // destructive
11550 __ Smaxv(d3, p0, z31.VnD());
11551
11552 __ Umaxv(b4, p0, z31.VnB());
11553 __ Mov(z5, z31);
11554 __ Umaxv(h5, p0, z5.VnH()); // destructive
11555 __ Umaxv(s6, p0, z31.VnS());
11556 __ Mov(z7, z31);
11557 __ Umaxv(d7, p0, z7.VnD()); // destructive
11558
11559 END();
11560
11561 if (CAN_RUN()) {
11562 RUN();
11563
11564 if (static_cast<int>(ArrayLength(pg_in)) >= config->sve_vl_in_bytes()) {
11565 // Smaxv
11566 ASSERT_EQUAL_64(0x33, d0);
11567 ASSERT_EQUAL_64(0x44aa, d1);
11568 ASSERT_EQUAL_64(0x112233, d2);
11569 ASSERT_EQUAL_64(0x112233aabbfc00, d3);
11570 // Umaxv
11571 ASSERT_EQUAL_64(0xfe, d4);
11572 ASSERT_EQUAL_64(0xfc00, d5);
11573 ASSERT_EQUAL_64(0xaabbfc00, d6);
11574 ASSERT_EQUAL_64(0x112233aabbfc00, d7);
11575 } else {
11576 // Smaxv
11577 ASSERT_EQUAL_64(0x33, d0);
11578 ASSERT_EQUAL_64(0x44aa, d1);
11579 ASSERT_EQUAL_64(0x112233, d2);
11580 ASSERT_EQUAL_64(0x00112233aabbfc00, d3);
11581 // Umaxv
11582 ASSERT_EQUAL_64(0xfe, d4);
11583 ASSERT_EQUAL_64(0xfc00, d5);
11584 ASSERT_EQUAL_64(0xaabbfc00, d6);
11585 ASSERT_EQUAL_64(0xfffa5555aaaaaaaa, d7);
11586 }
11587
11588 // Check the upper lanes above the top of the V register are all clear.
11589 for (int i = 1; i < core.GetSVELaneCount(kDRegSize); i++) {
11590 ASSERT_EQUAL_SVE_LANE(0, z0.VnD(), i);
11591 ASSERT_EQUAL_SVE_LANE(0, z1.VnD(), i);
11592 ASSERT_EQUAL_SVE_LANE(0, z2.VnD(), i);
11593 ASSERT_EQUAL_SVE_LANE(0, z3.VnD(), i);
11594 ASSERT_EQUAL_SVE_LANE(0, z4.VnD(), i);
11595 ASSERT_EQUAL_SVE_LANE(0, z5.VnD(), i);
11596 ASSERT_EQUAL_SVE_LANE(0, z6.VnD(), i);
11597 ASSERT_EQUAL_SVE_LANE(0, z7.VnD(), i);
11598 }
11599 }
11600}
11601
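// Run the Sdot or Udot macro over several combinations of distinct and
// aliased destination, accumulator and source registers, checking the
// accumulated results and that the unaliased inputs are preserved.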
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011602template <typename T, size_t M, size_t N>
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011603static void SdotUdotHelper(Test* config,
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011604 unsigned lane_size_in_bits,
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011605 const T (&zd_inputs)[M],
11606 const T (&za_inputs)[M],
11607 const T (&zn_inputs)[N],
11608 const T (&zm_inputs)[N],
11609 const T (&zd_expected)[M],
11610 const T (&zdnm_expected)[M],
11611 bool is_signed,
11612 int index = -1) {
11613 VIXL_STATIC_ASSERT(N == (M * 4));
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011614 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
11615 START();
11616
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011617 auto dot_fn = [&](const ZRegister& zd,
11618 const ZRegister& za,
11619 const ZRegister& zn,
11620 const ZRegister& zm,
11621 bool is_signed,
11622 int index) {
11623 if (is_signed) {
11624 if (index < 0) {
11625 __ Sdot(zd, za, zn, zm);
11626 } else {
11627 __ Sdot(zd, za, zn, zm, index);
11628 }
11629 } else {
11630 if (index < 0) {
11631 __ Udot(zd, za, zn, zm);
11632 } else {
11633 __ Udot(zd, za, zn, zm, index);
11634 }
11635 }
11636 };
11637
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011638 ZRegister zd = z0.WithLaneSize(lane_size_in_bits);
11639 ZRegister za = z1.WithLaneSize(lane_size_in_bits);
11640 ZRegister zn = z2.WithLaneSize(lane_size_in_bits / 4);
11641 ZRegister zm = z3.WithLaneSize(lane_size_in_bits / 4);
11642
11643 InsrHelper(&masm, zd, zd_inputs);
11644 InsrHelper(&masm, za, za_inputs);
11645 InsrHelper(&masm, zn, zn_inputs);
11646 InsrHelper(&masm, zm, zm_inputs);
11647
11648 // The Dot macro handles arbitrarily-aliased registers in the argument list.
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011649 ZRegister dm_result = z4.WithLaneSize(lane_size_in_bits);
11650 ZRegister dnm_result = z5.WithLaneSize(lane_size_in_bits);
11651 ZRegister da_result = z6.WithLaneSize(lane_size_in_bits);
11652 ZRegister dn_result = z7.WithLaneSize(lane_size_in_bits);
11653 ZRegister d_result = z8.WithLaneSize(lane_size_in_bits);
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011654
11655 __ Mov(da_result, za);
11656 // zda = zda + (zn . zm)
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011657 dot_fn(da_result, da_result, zn, zm, is_signed, index);
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011658
TatWai Chong50ef1712020-06-19 05:47:44 -070011659 __ Mov(dn_result, zn.WithSameLaneSizeAs(dn_result));
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011660 // zdn = za + (zdn . zm)
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011661 dot_fn(dn_result, za, dn_result.WithSameLaneSizeAs(zn), zm, is_signed, index);
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011662
TatWai Chong50ef1712020-06-19 05:47:44 -070011663 __ Mov(dm_result, zm.WithSameLaneSizeAs(dm_result));
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011664 // zdm = za + (zn . zdm)
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011665 dot_fn(dm_result, za, zn, dm_result.WithSameLaneSizeAs(zm), is_signed, index);
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011666
11667 __ Mov(d_result, zd);
11668 // zd = za + (zn . zm)
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011669 dot_fn(d_result, za, zn, zm, is_signed, index);
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011670
TatWai Chong50ef1712020-06-19 05:47:44 -070011671 __ Mov(dnm_result, zn.WithSameLaneSizeAs(dnm_result));
  // zdnm = za + (zdnm . zdnm)
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011673 dot_fn(dnm_result,
11674 za,
11675 dnm_result.WithSameLaneSizeAs(zn),
11676 dnm_result.WithSameLaneSizeAs(zm),
11677 is_signed,
11678 index);
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011679
11680 END();
11681
11682 if (CAN_RUN()) {
11683 RUN();
11684
11685 ASSERT_EQUAL_SVE(za_inputs, z1.WithLaneSize(lane_size_in_bits));
11686 ASSERT_EQUAL_SVE(zn_inputs, z2.WithLaneSize(lane_size_in_bits / 4));
11687 ASSERT_EQUAL_SVE(zm_inputs, z3.WithLaneSize(lane_size_in_bits / 4));
11688
11689 ASSERT_EQUAL_SVE(zd_expected, da_result);
11690 ASSERT_EQUAL_SVE(zd_expected, dn_result);
11691 ASSERT_EQUAL_SVE(zd_expected, dm_result);
11692 ASSERT_EQUAL_SVE(zd_expected, d_result);
11693
11694 ASSERT_EQUAL_SVE(zdnm_expected, dnm_result);
11695 }
11696}
11697
11698TEST_SVE(sve_sdot) {
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011699 int64_t zd_inputs[] = {0x33, 0xee, 0xff};
11700 int64_t za_inputs[] = {INT32_MAX, -3, 2};
11701 int64_t zn_inputs[] = {-128, -128, -128, -128, 9, -1, 1, 30, -5, -20, 9, 8};
11702 int64_t zm_inputs[] = {-128, -128, -128, -128, -19, 15, 6, 0, 9, -5, 4, 5};
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011703
11704 // zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011705 int64_t zd_expected_s[] = {-2147418113, -183, 133}; // 0x8000ffff
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011706 int64_t zd_expected_d[] = {2147549183, -183, 133}; // 0x8000ffff
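  // For example, the lowest lane accumulates the lowest four byte products:
  //   2 + ((-5 * 9) + (-20 * -5) + (9 * 4) + (8 * 5)) = 2 + 131 = 133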
11707
11708 // zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011709 int64_t zdnm_expected_s[] = {-2147418113, 980, 572};
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011710 int64_t zdnm_expected_d[] = {2147549183, 980, 572};
11711
11712 SdotUdotHelper(config,
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011713 kSRegSize,
11714 zd_inputs,
11715 za_inputs,
11716 zn_inputs,
11717 zm_inputs,
11718 zd_expected_s,
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011719 zdnm_expected_s,
11720 true);
11721
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011722 SdotUdotHelper(config,
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011723 kDRegSize,
11724 zd_inputs,
11725 za_inputs,
11726 zn_inputs,
11727 zm_inputs,
11728 zd_expected_d,
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011729 zdnm_expected_d,
11730 true);
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011731}
11732
11733TEST_SVE(sve_udot) {
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011734 int64_t zd_inputs[] = {0x33, 0xee, 0xff};
11735 int64_t za_inputs[] = {INT32_MAX, -3, 2};
11736 int64_t zn_inputs[] = {-128, -128, -128, -128, 9, -1, 1, 30, -5, -20, 9, 8};
11737 int64_t zm_inputs[] = {-128, -128, -128, -128, -19, 15, 6, 0, 9, -5, 4, 5};
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011738
11739 // zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011740 int64_t zd_expected_s[] = {0x8000ffff, 0x00001749, 0x0000f085};
11741 int64_t zd_expected_d[] = {0x000000047c00ffff,
11742 0x000000000017ff49,
11743 0x00000000fff00085};
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011744
11745 // zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011746 int64_t zdnm_expected_s[] = {0x8000ffff, 0x000101d4, 0x0001d03c};
11747 int64_t zdnm_expected_d[] = {0x000000047c00ffff,
11748 0x00000000fffe03d4,
11749 0x00000001ffce023c};
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011750
11751 SdotUdotHelper(config,
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011752 kSRegSize,
11753 zd_inputs,
11754 za_inputs,
11755 zn_inputs,
11756 zm_inputs,
11757 zd_expected_s,
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011758 zdnm_expected_s,
11759 false);
11760
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011761 SdotUdotHelper(config,
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011762 kDRegSize,
11763 zd_inputs,
11764 za_inputs,
11765 zn_inputs,
11766 zm_inputs,
11767 zd_expected_d,
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011768 zdnm_expected_d,
11769 false);
11770}
11771
11772TEST_SVE(sve_sdot_indexed_s) {
11773 int64_t zd_inputs[] = {0xff, 0xff, 0xff, 0xff};
11774 int64_t za_inputs[] = {0, 1, 2, 3};
11775 int64_t zn_inputs[] =
11776 {-1, -1, -1, -1, -2, -2, -2, -2, -3, -3, -3, -3, -4, -4, -4, -4};
11777 int64_t zm_inputs[] =
11778 {127, 127, 127, 127, -128, -128, -128, -128, -1, -1, -1, -1, 0, 0, 0, 0};
11779
11780 constexpr int s = kQRegSize / kSRegSize;
11781
11782 // zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
11783 int64_t zd_expected_s[][s] = {{0, 1, 2, 3}, // Generated from zm[0]
11784 {4, 9, 14, 19},
11785 {512, 1025, 1538, 2051},
11786 {-508, -1015, -1522, -2029}};
11787
11788 // zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
11789 int64_t zdnm_expected_s[][s] = {{16, 33, 50, 67},
11790 {12, 25, 38, 51},
11791 {8, 17, 26, 35},
11792 {4, 9, 14, 19}};
11793
11794 for (unsigned i = 0; i < s; i++) {
11795 SdotUdotHelper(config,
11796 kSRegSize,
11797 zd_inputs,
11798 za_inputs,
11799 zn_inputs,
11800 zm_inputs,
11801 zd_expected_s[i],
11802 zdnm_expected_s[i],
11803 true,
11804 i);
11805 }
11806}
11807
11808TEST_SVE(sve_sdot_indexed_d) {
11809 int64_t zd_inputs[] = {0xff, 0xff};
11810 int64_t za_inputs[] = {0, 1};
11811 int64_t zn_inputs[] = {-1, -1, -1, -1, -1, -1, -1, -1};
11812 int64_t zm_inputs[] = {-128, -128, -128, -128, 127, 127, 127, 127};
11813
11814 constexpr int d = kQRegSize / kDRegSize;
11815
11816 // zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
11817 int64_t zd_expected_d[][d] = {{-508, -507}, // Generated from zm[0]
11818 {512, 513}};
11819
11820 // zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
11821 int64_t zdnm_expected_d[][d] = {{4, 5}, {4, 5}};
11822
11823 for (unsigned i = 0; i < d; i++) {
11824 SdotUdotHelper(config,
11825 kDRegSize,
11826 zd_inputs,
11827 za_inputs,
11828 zn_inputs,
11829 zm_inputs,
11830 zd_expected_d[i],
11831 zdnm_expected_d[i],
11832 true,
11833 i);
11834 }
11835}
11836
11837TEST_SVE(sve_udot_indexed_s) {
11838 int64_t zd_inputs[] = {0xff, 0xff, 0xff, 0xff};
11839 int64_t za_inputs[] = {0, 1, 2, 3};
11840 int64_t zn_inputs[] = {1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4};
11841 int64_t zm_inputs[] =
11842 {127, 127, 127, 127, 255, 255, 255, 255, 1, 1, 1, 1, 0, 0, 0, 0};
11843
11844 constexpr int s = kQRegSize / kSRegSize;
11845
11846 // zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
11847 int64_t zd_expected_s[][s] = {{0, 1, 2, 3},
11848 {4, 9, 14, 19},
11849 {1020, 2041, 3062, 4083},
11850 {508, 1017, 1526, 2035}};
11851
11852 // zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
11853 int64_t zdnm_expected_s[][s] = {{16, 33, 50, 67},
11854 {12, 25, 38, 51},
11855 {8, 17, 26, 35},
11856 {4, 9, 14, 19}};
11857
11858 for (unsigned i = 0; i < s; i++) {
11859 SdotUdotHelper(config,
11860 kSRegSize,
11861 zd_inputs,
11862 za_inputs,
11863 zn_inputs,
11864 zm_inputs,
11865 zd_expected_s[i],
11866 zdnm_expected_s[i],
11867 false,
11868 i);
11869 }
11870}
11871
11872TEST_SVE(sve_udot_indexed_d) {
11873 int64_t zd_inputs[] = {0xff, 0xff};
11874 int64_t za_inputs[] = {0, 1};
11875 int64_t zn_inputs[] = {1, 1, 1, 1, 1, 1, 1, 1};
11876 int64_t zm_inputs[] = {255, 255, 255, 255, 127, 127, 127, 127};
11877
11878 constexpr int d = kQRegSize / kDRegSize;
11879
11880 // zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
11881 int64_t zd_expected_d[][d] = {{508, 509}, {1020, 1021}};
11882
11883 // zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
11884 int64_t zdnm_expected_d[][d] = {{4, 5}, {4, 5}};
11885
11886 for (unsigned i = 0; i < d; i++) {
11887 SdotUdotHelper(config,
11888 kDRegSize,
11889 zd_inputs,
11890 za_inputs,
11891 zn_inputs,
11892 zm_inputs,
11893 zd_expected_d[i],
11894 zdnm_expected_d[i],
11895 false,
11896 i);
11897 }
11898}
11899
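// Add the index of the 128-bit segment containing each lane to the
// corresponding lane of `src`, so that per-segment expected patterns can be
// built for vectors longer than 128 bits.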
11900static void IntSegmentPatternHelper(MacroAssembler* masm,
11901 const ZRegister& dst,
11902 const ZRegister& src) {
11903 VIXL_ASSERT(AreSameLaneSize(dst, src));
11904 UseScratchRegisterScope temps(masm);
11905 ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(dst);
11906 masm->Index(ztmp, 0, 1);
11907 masm->Asr(ztmp, ztmp, kQRegSizeInBytesLog2 - dst.GetLaneSizeInBytesLog2());
11908 masm->Add(dst, src, ztmp);
11909}
11910
11911TEST_SVE(sve_sdot_udot_indexed_s) {
11912 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
11913 START();
11914
11915 const int multiplier = 2;
11916 __ Dup(z9.VnS(), multiplier);
11917
11918 __ Ptrue(p0.VnB());
11919 __ Index(z29.VnS(), 4, 1);
11920
11921 // z29 = [... 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0]
11922 __ And(z29.VnS(), z29.VnS(), 3);
11923
11924 // p7 = [... 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1]
11925 __ Cmple(p7.VnS(), p0.Zeroing(), z29.VnS(), 0);
11926
11927 // p6 = [... 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
11928 __ Cmple(p6.VnS(), p0.Zeroing(), z29.VnS(), 1);
11929
11930 // p5 = [... 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1]
11931 __ Cmple(p5.VnS(), p0.Zeroing(), z29.VnS(), 2);
11932
11933 __ Index(z28.VnB(), 1, 1);
11934 __ Dup(z27.VnS(), z28.VnS(), 0);
11935
11936 // z27 = [... 3, 2, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1]
11937 IntSegmentPatternHelper(&masm, z27.VnB(), z27.VnB());
11938
11939 // z27 = [... 6, 4, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1, 8, 6, 4, 2]
11940 __ Mul(z27.VnS(), p7.Merging(), z27.VnS(), z9.VnS());
11941
11942 // z27 = [... 12, 8, 4, 3, 2, 1, 4, 3, 2, 1, 8, 6, 4, 2, 16, 12, 8, 4]
11943 __ Mul(z27.VnS(), p6.Merging(), z27.VnS(), z9.VnS());
11944
11945 // 2nd segment | 1st segment |
11946 // v v
11947 // z27 = [... 24, 16, 4, 3, 2, 1, 8, 6, 4, 2, 16, 12, 8, 4, 32, 24, 16, 8]
11948 __ Mul(z27.VnS(), p5.Merging(), z27.VnS(), z9.VnS());
11949
11950 __ Dup(z0.VnS(), 0);
11951 __ Dup(z1.VnS(), 0);
11952 __ Dup(z2.VnS(), 0);
11953 __ Dup(z3.VnS(), 0);
11954 __ Dup(z4.VnS(), 0);
11955 __ Dup(z5.VnS(), 0);
11956
  // Skip the lanes from the 129th lane onwards, since the values of these
  // lanes overflow after the number sequence is created by `index`.
11959 __ Cmpls(p3.VnB(), p0.Zeroing(), z28.VnB(), 128);
11960 __ Mov(z0.VnB(), p3.Merging(), z27.VnB());
11961 __ Mov(z1.VnB(), p3.Merging(), z28.VnB());
11962
11963 __ Dup(z2.VnS(), 0);
11964 __ Dup(z3.VnS(), 0);
11965 __ Dup(z4.VnS(), 0);
11966 __ Dup(z5.VnS(), 0);
11967
11968 __ Udot(z2.VnS(), z2.VnS(), z1.VnB(), z0.VnB(), 0);
11969
11970 __ Udot(z3.VnS(), z3.VnS(), z1.VnB(), z0.VnB(), 1);
11971 __ Mul(z3.VnS(), z3.VnS(), 2);
11972
11973 __ Udot(z4.VnS(), z4.VnS(), z1.VnB(), z0.VnB(), 2);
11974 __ Mul(z4.VnS(), z4.VnS(), 4);
11975
11976 __ Udot(z5.VnS(), z5.VnS(), z1.VnB(), z0.VnB(), 3);
11977 __ Mul(z5.VnS(), z5.VnS(), 8);
11978
11979 __ Dup(z7.VnS(), 0);
11980 __ Dup(z8.VnS(), 0);
11981 __ Dup(z9.VnS(), 0);
11982 __ Dup(z10.VnS(), 0);
11983
  // Negate the all-positive vector in order to test the signed dot product.
11985 __ Neg(z6.VnB(), p0.Merging(), z0.VnB());
11986 __ Sdot(z7.VnS(), z7.VnS(), z1.VnB(), z6.VnB(), 0);
11987
11988 __ Sdot(z8.VnS(), z8.VnS(), z1.VnB(), z6.VnB(), 1);
11989 __ Mul(z8.VnS(), z8.VnS(), 2);
11990
11991 __ Sdot(z9.VnS(), z9.VnS(), z1.VnB(), z6.VnB(), 2);
11992 __ Mul(z9.VnS(), z9.VnS(), 4);
11993
11994 __ Sdot(z10.VnS(), z10.VnS(), z1.VnB(), z6.VnB(), 3);
11995 __ Mul(z10.VnS(), z10.VnS(), 8);
11996
11997 END();
11998
11999 if (CAN_RUN()) {
12000 RUN();
12001
    // Only compare the first 128-bit segment of the destination register; the
    // results of the other generated instructions check the remaining part.
12004 // s_lane[0] = (1 * 8) + (2 * 16) + (3 * 24) + (4 * 32) = 240
12005 // ...
12006 // s_lane[3] = (13 * 8) + (14 * 16) + (15 * 24) + (16 * 32) = 1200
12007 int udot_expected[] = {1200, 880, 560, 240};
12008 ASSERT_EQUAL_SVE(udot_expected, z2.VnS());
12009 ASSERT_EQUAL_SVE(z2.VnS(), z3.VnS());
12010 ASSERT_EQUAL_SVE(z2.VnS(), z4.VnS());
12011 ASSERT_EQUAL_SVE(z2.VnS(), z5.VnS());
12012
12013 int sdot_expected[] = {-1200, -880, -560, -240};
12014 ASSERT_EQUAL_SVE(sdot_expected, z7.VnS());
12015 ASSERT_EQUAL_SVE(z7.VnS(), z8.VnS());
12016 ASSERT_EQUAL_SVE(z7.VnS(), z9.VnS());
12017 ASSERT_EQUAL_SVE(z7.VnS(), z10.VnS());
12018 }
12019}
12020
12021TEST_SVE(sve_sdot_udot_indexed_d) {
12022 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
12023 START();
12024
12025 const int multiplier = 2;
12026 __ Dup(z9.VnD(), multiplier);
12027
12028 __ Ptrue(p0.VnD());
12029 __ Pfalse(p1.VnD());
12030
12031 // p2 = [..., 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
12032 __ Zip1(p2.VnD(), p0.VnD(), p1.VnD());
12033
12034 __ Index(z1.VnH(), 1, 1);
12035 __ Dup(z0.VnD(), z1.VnD(), 0);
12036
12037 // z0 = [... 5, 4, 3, 2, 5, 4, 3, 2, 4, 3, 2, 1, 4, 3, 2, 1]
12038 IntSegmentPatternHelper(&masm, z0.VnH(), z0.VnH());
12039
12040 // 2nd segment | 1st segment |
12041 // v v
12042 // z0 = [... 5, 4, 3, 2, 10, 8, 6, 4, 4, 3, 2, 1, 8, 6, 4, 2]
12043 __ Mul(z0.VnD(), p2.Merging(), z0.VnD(), z9.VnD());
12044
12045 __ Dup(z3.VnD(), 0);
12046 __ Dup(z4.VnD(), 0);
12047
12048 __ Udot(z3.VnD(), z3.VnD(), z1.VnH(), z0.VnH(), 0);
12049
12050 __ Udot(z4.VnD(), z4.VnD(), z1.VnH(), z0.VnH(), 1);
12051 __ Mul(z4.VnD(), z4.VnD(), multiplier);
12052
12053 __ Dup(z12.VnD(), 0);
12054 __ Dup(z13.VnD(), 0);
12055
12056 __ Ptrue(p4.VnH());
12057 __ Neg(z10.VnH(), p4.Merging(), z0.VnH());
12058
12059 __ Sdot(z12.VnD(), z12.VnD(), z1.VnH(), z10.VnH(), 0);
12060
12061 __ Sdot(z13.VnD(), z13.VnD(), z1.VnH(), z10.VnH(), 1);
12062 __ Mul(z13.VnD(), z13.VnD(), multiplier);
12063
12064 END();
12065
12066 if (CAN_RUN()) {
12067 RUN();
12068
    // Only compare the lowest lanes of the destination register against
    // literal values; the results of the other generated instructions check
    // the remaining part.
12071 // d_lane[0] = (1 * 2) + (2 * 4) + (3 * 6) + (4 * 8) = 60
12072 // d_lane[1] = (5 * 2) + (6 * 4) + (7 * 6) + (8 * 8) = 140
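    // d_lane[2] = (9 * 4) + (10 * 6) + (11 * 8) + (12 * 10) = 304
    // d_lane[3] = (13 * 4) + (14 * 6) + (15 * 8) + (16 * 10) = 416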
12073 uint64_t udot_expected[] = {416, 304, 140, 60};
12074 ASSERT_EQUAL_SVE(udot_expected, z3.VnD());
12075 ASSERT_EQUAL_SVE(z3.VnD(), z4.VnD());
12076
12077 int64_t sdot_expected[] = {-416, -304, -140, -60};
12078 ASSERT_EQUAL_SVE(sdot_expected, z12.VnD());
12079 ASSERT_EQUAL_SVE(z12.VnD(), z13.VnD());
12080 }
TatWai Chong4d2a4e92019-10-23 16:19:32 -070012081}
12082
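// Convert an array of floating-point inputs to raw bit patterns of the
// requested lane size.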
TatWai Chong7a0d3672019-10-23 17:35:18 -070012083template <typename T, size_t N>
12084static void FPToRawbitsWithSize(const T (&inputs)[N],
12085 uint64_t* outputs,
12086 unsigned size_in_bits) {
TatWai Chongfe536042019-10-23 16:34:11 -070012087 for (size_t i = 0; i < N; i++) {
TatWai Chong7a0d3672019-10-23 17:35:18 -070012088 outputs[i] = vixl::FPToRawbitsWithSize(size_in_bits, inputs[i]);
TatWai Chongfe536042019-10-23 16:34:11 -070012089 }
12090}
12091
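// Apply an unpredicated FP binary arithmetic macro to the given inputs and
// compare the destination, lane by lane, against the expected raw-bit values.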
TatWai Chong7a0d3672019-10-23 17:35:18 -070012092template <typename Ti, typename Te, size_t N>
12093static void FPBinArithHelper(Test* config,
12094 ArithFn macro,
12095 int lane_size_in_bits,
12096 const Ti (&zn_inputs)[N],
12097 const Ti (&zm_inputs)[N],
12098 const Te (&zd_expected)[N]) {
TatWai Chongfe536042019-10-23 16:34:11 -070012099 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
TatWai Chong7a0d3672019-10-23 17:35:18 -070012100
TatWai Chongfe536042019-10-23 16:34:11 -070012101 START();
12102
12103 ZRegister zd = z29.WithLaneSize(lane_size_in_bits);
12104 ZRegister zn = z30.WithLaneSize(lane_size_in_bits);
12105 ZRegister zm = z31.WithLaneSize(lane_size_in_bits);
12106
12107 uint64_t zn_rawbits[N];
12108 uint64_t zm_rawbits[N];
12109
TatWai Chong7a0d3672019-10-23 17:35:18 -070012110 FPToRawbitsWithSize(zn_inputs, zn_rawbits, lane_size_in_bits);
12111 FPToRawbitsWithSize(zm_inputs, zm_rawbits, lane_size_in_bits);
TatWai Chongfe536042019-10-23 16:34:11 -070012112
12113 InsrHelper(&masm, zn, zn_rawbits);
12114 InsrHelper(&masm, zm, zm_rawbits);
12115
12116 (masm.*macro)(zd, zn, zm);
12117
12118 END();
12119
12120 if (CAN_RUN()) {
12121 RUN();
12122
12123 ASSERT_EQUAL_SVE(zd_expected, zd);
12124 }
12125}
12126
12127TEST_SVE(sve_fp_arithmetic_unpredicated_fadd) {
12128 double zn_inputs[] = {24.0,
12129 5.5,
12130 0.0,
12131 3.875,
12132 2.125,
12133 kFP64PositiveInfinity,
12134 kFP64NegativeInfinity};
12135
12136 double zm_inputs[] = {1024.0, 2048.0, 0.1, -4.75, 12.34, 255.0, -13.0};
12137
TatWai Chong7a0d3672019-10-23 17:35:18 -070012138 ArithFn fn = &MacroAssembler::Fadd;
TatWai Chongfe536042019-10-23 16:34:11 -070012139
12140 uint16_t expected_h[] = {Float16ToRawbits(Float16(1048.0)),
12141 Float16ToRawbits(Float16(2053.5)),
12142 Float16ToRawbits(Float16(0.1)),
12143 Float16ToRawbits(Float16(-0.875)),
12144 Float16ToRawbits(Float16(14.465)),
12145 Float16ToRawbits(kFP16PositiveInfinity),
12146 Float16ToRawbits(kFP16NegativeInfinity)};
12147
TatWai Chong7a0d3672019-10-23 17:35:18 -070012148 FPBinArithHelper(config, fn, kHRegSize, zn_inputs, zm_inputs, expected_h);
TatWai Chongfe536042019-10-23 16:34:11 -070012149
12150 uint32_t expected_s[] = {FloatToRawbits(1048.0f),
12151 FloatToRawbits(2053.5f),
12152 FloatToRawbits(0.1f),
12153 FloatToRawbits(-0.875f),
12154 FloatToRawbits(14.465f),
12155 FloatToRawbits(kFP32PositiveInfinity),
12156 FloatToRawbits(kFP32NegativeInfinity)};
12157
TatWai Chong7a0d3672019-10-23 17:35:18 -070012158 FPBinArithHelper(config, fn, kSRegSize, zn_inputs, zm_inputs, expected_s);
TatWai Chongfe536042019-10-23 16:34:11 -070012159
12160 uint64_t expected_d[] = {DoubleToRawbits(1048.0),
12161 DoubleToRawbits(2053.5),
12162 DoubleToRawbits(0.1),
12163 DoubleToRawbits(-0.875),
12164 DoubleToRawbits(14.465),
12165 DoubleToRawbits(kFP64PositiveInfinity),
12166 DoubleToRawbits(kFP64NegativeInfinity)};
12167
TatWai Chong7a0d3672019-10-23 17:35:18 -070012168 FPBinArithHelper(config, fn, kDRegSize, zn_inputs, zm_inputs, expected_d);
TatWai Chongfe536042019-10-23 16:34:11 -070012169}
12170
12171TEST_SVE(sve_fp_arithmetic_unpredicated_fsub) {
12172 double zn_inputs[] = {24.0,
12173 5.5,
12174 0.0,
12175 3.875,
12176 2.125,
12177 kFP64PositiveInfinity,
12178 kFP64NegativeInfinity};
12179
12180 double zm_inputs[] = {1024.0, 2048.0, 0.1, -4.75, 12.34, 255.0, -13.0};
12181
TatWai Chong7a0d3672019-10-23 17:35:18 -070012182 ArithFn fn = &MacroAssembler::Fsub;
TatWai Chongfe536042019-10-23 16:34:11 -070012183
12184 uint16_t expected_h[] = {Float16ToRawbits(Float16(-1000.0)),
12185 Float16ToRawbits(Float16(-2042.5)),
12186 Float16ToRawbits(Float16(-0.1)),
12187 Float16ToRawbits(Float16(8.625)),
12188 Float16ToRawbits(Float16(-10.215)),
12189 Float16ToRawbits(kFP16PositiveInfinity),
12190 Float16ToRawbits(kFP16NegativeInfinity)};
12191
TatWai Chong7a0d3672019-10-23 17:35:18 -070012192 FPBinArithHelper(config, fn, kHRegSize, zn_inputs, zm_inputs, expected_h);
TatWai Chongfe536042019-10-23 16:34:11 -070012193
12194 uint32_t expected_s[] = {FloatToRawbits(-1000.0),
12195 FloatToRawbits(-2042.5),
12196 FloatToRawbits(-0.1),
12197 FloatToRawbits(8.625),
12198 FloatToRawbits(-10.215),
12199 FloatToRawbits(kFP32PositiveInfinity),
12200 FloatToRawbits(kFP32NegativeInfinity)};
12201
TatWai Chong7a0d3672019-10-23 17:35:18 -070012202 FPBinArithHelper(config, fn, kSRegSize, zn_inputs, zm_inputs, expected_s);
TatWai Chongfe536042019-10-23 16:34:11 -070012203
12204 uint64_t expected_d[] = {DoubleToRawbits(-1000.0),
12205 DoubleToRawbits(-2042.5),
12206 DoubleToRawbits(-0.1),
12207 DoubleToRawbits(8.625),
12208 DoubleToRawbits(-10.215),
12209 DoubleToRawbits(kFP64PositiveInfinity),
12210 DoubleToRawbits(kFP64NegativeInfinity)};
12211
TatWai Chong7a0d3672019-10-23 17:35:18 -070012212 FPBinArithHelper(config, fn, kDRegSize, zn_inputs, zm_inputs, expected_d);
TatWai Chongfe536042019-10-23 16:34:11 -070012213}
12214
12215TEST_SVE(sve_fp_arithmetic_unpredicated_fmul) {
12216 double zn_inputs[] = {24.0,
12217 5.5,
12218 0.0,
12219 3.875,
12220 2.125,
12221 kFP64PositiveInfinity,
12222 kFP64NegativeInfinity};
12223
12224 double zm_inputs[] = {1024.0, 2048.0, 0.1, -4.75, 12.34, 255.0, -13.0};
12225
TatWai Chong7a0d3672019-10-23 17:35:18 -070012226 ArithFn fn = &MacroAssembler::Fmul;
TatWai Chongfe536042019-10-23 16:34:11 -070012227
12228 uint16_t expected_h[] = {Float16ToRawbits(Float16(24576.0)),
12229 Float16ToRawbits(Float16(11264.0)),
12230 Float16ToRawbits(Float16(0.0)),
12231 Float16ToRawbits(Float16(-18.4)),
12232 Float16ToRawbits(Float16(26.23)),
12233 Float16ToRawbits(kFP16PositiveInfinity),
12234 Float16ToRawbits(kFP16PositiveInfinity)};
12235
TatWai Chong7a0d3672019-10-23 17:35:18 -070012236 FPBinArithHelper(config, fn, kHRegSize, zn_inputs, zm_inputs, expected_h);
TatWai Chongfe536042019-10-23 16:34:11 -070012237
12238 uint32_t expected_s[] = {FloatToRawbits(24576.0),
12239 FloatToRawbits(11264.0),
12240 FloatToRawbits(0.0),
12241 FloatToRawbits(-18.40625),
12242 FloatToRawbits(26.2225),
12243 FloatToRawbits(kFP32PositiveInfinity),
12244 FloatToRawbits(kFP32PositiveInfinity)};
12245
TatWai Chong7a0d3672019-10-23 17:35:18 -070012246 FPBinArithHelper(config, fn, kSRegSize, zn_inputs, zm_inputs, expected_s);
TatWai Chongfe536042019-10-23 16:34:11 -070012247
12248 uint64_t expected_d[] = {DoubleToRawbits(24576.0),
12249 DoubleToRawbits(11264.0),
12250 DoubleToRawbits(0.0),
12251 DoubleToRawbits(-18.40625),
12252 DoubleToRawbits(26.2225),
12253 DoubleToRawbits(kFP64PositiveInfinity),
12254 DoubleToRawbits(kFP64PositiveInfinity)};
12255
TatWai Chong7a0d3672019-10-23 17:35:18 -070012256 FPBinArithHelper(config, fn, kDRegSize, zn_inputs, zm_inputs, expected_d);
TatWai Chongfe536042019-10-23 16:34:11 -070012257}
12258
TatWai Chong7a0d3672019-10-23 17:35:18 -070012259typedef void (MacroAssembler::*FPArithPredicatedFn)(
12260 const ZRegister& zd,
12261 const PRegisterM& pg,
12262 const ZRegister& zn,
12263 const ZRegister& zm,
12264 FPMacroNaNPropagationOption nan_option);
12265
Martyn Capewell37f28182020-01-14 10:15:10 +000012266typedef void (MacroAssembler::*FPArithPredicatedNoNaNOptFn)(
12267 const ZRegister& zd,
12268 const PRegisterM& pg,
12269 const ZRegister& zn,
12270 const ZRegister& zm);
12271
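// Apply a predicated FP binary arithmetic macro (exactly one of `macro` and
// `macro_nonan` must be non-NULL) over several combinations of aliased
// operands. Active lanes are checked against `zd_expected`; inactive lanes
// must keep the value of the corresponding input register.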
TatWai Chong7a0d3672019-10-23 17:35:18 -070012272template <typename Ti, typename Te, size_t N>
12273static void FPBinArithHelper(
12274 Test* config,
12275 FPArithPredicatedFn macro,
Martyn Capewell37f28182020-01-14 10:15:10 +000012276 FPArithPredicatedNoNaNOptFn macro_nonan,
TatWai Chong7a0d3672019-10-23 17:35:18 -070012277 unsigned lane_size_in_bits,
12278 const Ti (&zd_inputs)[N],
12279 const int (&pg_inputs)[N],
12280 const Ti (&zn_inputs)[N],
12281 const Ti (&zm_inputs)[N],
12282 const Te (&zd_expected)[N],
12283 FPMacroNaNPropagationOption nan_option = FastNaNPropagation) {
Martyn Capewell37f28182020-01-14 10:15:10 +000012284 VIXL_ASSERT((macro == NULL) ^ (macro_nonan == NULL));
TatWai Chongd316c5e2019-10-16 12:22:10 -070012285 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
12286 START();
12287
TatWai Chong7a0d3672019-10-23 17:35:18 -070012288 // Avoid choosing default scratch registers.
12289 ZRegister zd = z26.WithLaneSize(lane_size_in_bits);
12290 ZRegister zn = z27.WithLaneSize(lane_size_in_bits);
12291 ZRegister zm = z28.WithLaneSize(lane_size_in_bits);
TatWai Chongd316c5e2019-10-16 12:22:10 -070012292
TatWai Chong7a0d3672019-10-23 17:35:18 -070012293 uint64_t zn_inputs_rawbits[N];
12294 uint64_t zm_inputs_rawbits[N];
12295 uint64_t zd_inputs_rawbits[N];
TatWai Chongd316c5e2019-10-16 12:22:10 -070012296
TatWai Chong7a0d3672019-10-23 17:35:18 -070012297 FPToRawbitsWithSize(zn_inputs, zn_inputs_rawbits, lane_size_in_bits);
12298 FPToRawbitsWithSize(zm_inputs, zm_inputs_rawbits, lane_size_in_bits);
12299 FPToRawbitsWithSize(zd_inputs, zd_inputs_rawbits, lane_size_in_bits);
12300
12301 InsrHelper(&masm, zn, zn_inputs_rawbits);
12302 InsrHelper(&masm, zm, zm_inputs_rawbits);
12303 InsrHelper(&masm, zd, zd_inputs_rawbits);
TatWai Chongd316c5e2019-10-16 12:22:10 -070012304
12305 PRegisterWithLaneSize pg = p0.WithLaneSize(lane_size_in_bits);
12306 Initialise(&masm, pg, pg_inputs);
12307
12308 // `instr` zdn, pg, zdn, zm
12309 ZRegister dn_result = z0.WithLaneSize(lane_size_in_bits);
12310 __ Mov(dn_result, zn);
Martyn Capewell37f28182020-01-14 10:15:10 +000012311 if (macro_nonan == NULL) {
12312 (masm.*macro)(dn_result, pg.Merging(), dn_result, zm, nan_option);
12313 } else {
12314 (masm.*macro_nonan)(dn_result, pg.Merging(), dn_result, zm);
12315 }
TatWai Chongd316c5e2019-10-16 12:22:10 -070012316
  // Based on whether the zd and zm registers are aliased, the instruction
  // macro (`Instr`) either swaps the order of the operands, if the operation
  // is commutative, or switches to the reversed form, such as fdivr.
12320 // `instr` zdm, pg, zn, zdm
12321 ZRegister dm_result = z1.WithLaneSize(lane_size_in_bits);
12322 __ Mov(dm_result, zm);
Martyn Capewell37f28182020-01-14 10:15:10 +000012323 if (macro_nonan == NULL) {
12324 (masm.*macro)(dm_result, pg.Merging(), zn, dm_result, nan_option);
12325 } else {
12326 (masm.*macro_nonan)(dm_result, pg.Merging(), zn, dm_result);
12327 }
TatWai Chongd316c5e2019-10-16 12:22:10 -070012328
  // The instruction macro (`Instr`) automatically selects between `instr` and
  // movprfx + `instr`, based on whether the zd and zn registers are aliased.
  // A generated movprfx instruction is predicated, using the same governing
  // predicate register. In order to keep the result predictable, initialize
  // the destination register first.
12334 // `instr` zd, pg, zn, zm
12335 ZRegister d_result = z2.WithLaneSize(lane_size_in_bits);
12336 __ Mov(d_result, zd);
Martyn Capewell37f28182020-01-14 10:15:10 +000012337 if (macro_nonan == NULL) {
12338 (masm.*macro)(d_result, pg.Merging(), zn, zm, nan_option);
12339 } else {
12340 (masm.*macro_nonan)(d_result, pg.Merging(), zn, zm);
12341 }
TatWai Chongd316c5e2019-10-16 12:22:10 -070012342
12343 END();
12344
12345 if (CAN_RUN()) {
12346 RUN();
12347
12348 for (size_t i = 0; i < ArrayLength(zd_expected); i++) {
12349 int lane = static_cast<int>(ArrayLength(zd_expected) - i - 1);
12350 if (!core.HasSVELane(dn_result, lane)) break;
12351 if ((pg_inputs[i] & 1) != 0) {
12352 ASSERT_EQUAL_SVE_LANE(zd_expected[i], dn_result, lane);
12353 } else {
TatWai Chong7a0d3672019-10-23 17:35:18 -070012354 ASSERT_EQUAL_SVE_LANE(zn_inputs_rawbits[i], dn_result, lane);
TatWai Chongd316c5e2019-10-16 12:22:10 -070012355 }
12356 }
12357
12358 for (size_t i = 0; i < ArrayLength(zd_expected); i++) {
12359 int lane = static_cast<int>(ArrayLength(zd_expected) - i - 1);
12360 if (!core.HasSVELane(dm_result, lane)) break;
12361 if ((pg_inputs[i] & 1) != 0) {
12362 ASSERT_EQUAL_SVE_LANE(zd_expected[i], dm_result, lane);
12363 } else {
TatWai Chong7a0d3672019-10-23 17:35:18 -070012364 ASSERT_EQUAL_SVE_LANE(zm_inputs_rawbits[i], dm_result, lane);
TatWai Chongd316c5e2019-10-16 12:22:10 -070012365 }
12366 }
12367
12368 ASSERT_EQUAL_SVE(zd_expected, d_result);
12369 }
12370}
12371
12372TEST_SVE(sve_binary_arithmetic_predicated_fdiv) {
TatWai Chong7a0d3672019-10-23 17:35:18 -070012373 // The inputs are shared with different precision tests.
TatWai Chongd316c5e2019-10-16 12:22:10 -070012374 double zd_in[] = {0.1, 1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9};
12375
12376 double zn_in[] = {24.0,
12377 24.0,
12378 -2.0,
12379 -2.0,
12380 5.5,
12381 5.5,
12382 kFP64PositiveInfinity,
12383 kFP64PositiveInfinity,
12384 kFP64NegativeInfinity,
12385 kFP64NegativeInfinity};
12386
12387 double zm_in[] = {-2.0, -2.0, 24.0, 24.0, 0.5, 0.5, 0.65, 0.65, 24.0, 24.0};
12388
TatWai Chongd316c5e2019-10-16 12:22:10 -070012389 int pg_in[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
12390
TatWai Chong7a0d3672019-10-23 17:35:18 -070012391 uint16_t exp_h[] = {Float16ToRawbits(Float16(0.1)),
TatWai Chongd316c5e2019-10-16 12:22:10 -070012392 Float16ToRawbits(Float16(-12.0)),
12393 Float16ToRawbits(Float16(2.2)),
12394 Float16ToRawbits(Float16(-0.0833)),
12395 Float16ToRawbits(Float16(4.4)),
12396 Float16ToRawbits(Float16(11.0)),
12397 Float16ToRawbits(Float16(6.6)),
12398 Float16ToRawbits(kFP16PositiveInfinity),
12399 Float16ToRawbits(Float16(8.8)),
12400 Float16ToRawbits(kFP16NegativeInfinity)};
12401
TatWai Chong7a0d3672019-10-23 17:35:18 -070012402 FPBinArithHelper(config,
Martyn Capewell37f28182020-01-14 10:15:10 +000012403 NULL,
TatWai Chong7a0d3672019-10-23 17:35:18 -070012404 &MacroAssembler::Fdiv,
12405 kHRegSize,
12406 zd_in,
12407 pg_in,
12408 zn_in,
12409 zm_in,
12410 exp_h);
TatWai Chongd316c5e2019-10-16 12:22:10 -070012411
12412 uint32_t exp_s[] = {FloatToRawbits(0.1),
12413 FloatToRawbits(-12.0),
12414 FloatToRawbits(2.2),
12415 0xbdaaaaab,
12416 FloatToRawbits(4.4),
12417 FloatToRawbits(11.0),
12418 FloatToRawbits(6.6),
12419 FloatToRawbits(kFP32PositiveInfinity),
12420 FloatToRawbits(8.8),
12421 FloatToRawbits(kFP32NegativeInfinity)};
12422
TatWai Chong7a0d3672019-10-23 17:35:18 -070012423 FPBinArithHelper(config,
Martyn Capewell37f28182020-01-14 10:15:10 +000012424 NULL,
TatWai Chong7a0d3672019-10-23 17:35:18 -070012425 &MacroAssembler::Fdiv,
12426 kSRegSize,
12427 zd_in,
12428 pg_in,
12429 zn_in,
12430 zm_in,
12431 exp_s);
TatWai Chongd316c5e2019-10-16 12:22:10 -070012432
12433 uint64_t exp_d[] = {DoubleToRawbits(0.1),
12434 DoubleToRawbits(-12.0),
12435 DoubleToRawbits(2.2),
12436 0xbfb5555555555555,
12437 DoubleToRawbits(4.4),
12438 DoubleToRawbits(11.0),
12439 DoubleToRawbits(6.6),
12440 DoubleToRawbits(kFP64PositiveInfinity),
12441 DoubleToRawbits(8.8),
12442 DoubleToRawbits(kFP64NegativeInfinity)};
12443
TatWai Chong7a0d3672019-10-23 17:35:18 -070012444 FPBinArithHelper(config,
Martyn Capewell37f28182020-01-14 10:15:10 +000012445 NULL,
TatWai Chong7a0d3672019-10-23 17:35:18 -070012446 &MacroAssembler::Fdiv,
12447 kDRegSize,
12448 zd_in,
12449 pg_in,
12450 zn_in,
12451 zm_in,
12452 exp_d);
TatWai Chongd316c5e2019-10-16 12:22:10 -070012453}
12454
Martyn Capewell9cc3f142019-10-29 14:06:35 +000012455TEST_SVE(sve_select) {
12456 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
12457 START();
12458
12459 uint64_t in0[] = {0x01f203f405f607f8, 0xfefcf8f0e1c3870f, 0x123456789abcdef0};
12460 uint64_t in1[] = {0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa};
12461
12462 // For simplicity, we re-use the same pg for various lane sizes.
12463 // For D lanes: 1, 1, 0
12464 // For S lanes: 1, 1, 1, 0, 0
12465 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
12466 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
12467 Initialise(&masm, p0.VnB(), pg_in);
12468 PRegisterM pg = p0.Merging();
12469
12470 InsrHelper(&masm, z30.VnD(), in0);
12471 InsrHelper(&masm, z31.VnD(), in1);
12472
12473 __ Sel(z0.VnB(), pg, z30.VnB(), z31.VnB());
12474 __ Sel(z1.VnH(), pg, z30.VnH(), z31.VnH());
12475 __ Sel(z2.VnS(), pg, z30.VnS(), z31.VnS());
12476 __ Sel(z3.VnD(), pg, z30.VnD(), z31.VnD());
12477
12478 END();
12479
12480 if (CAN_RUN()) {
12481 RUN();
12482
12483 uint64_t expected_z0[] = {0xaaaaaaaa05aa07f8,
12484 0xfeaaaaf0aac3870f,
12485 0xaaaa56aa9abcdeaa};
12486 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
12487
12488 uint64_t expected_z1[] = {0xaaaaaaaaaaaa07f8,
12489 0xaaaaf8f0e1c3870f,
12490 0xaaaaaaaa9abcaaaa};
12491 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
12492
12493 uint64_t expected_z2[] = {0xaaaaaaaa05f607f8,
12494 0xfefcf8f0e1c3870f,
12495 0xaaaaaaaaaaaaaaaa};
12496 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
12497
12498 uint64_t expected_z3[] = {0x01f203f405f607f8,
12499 0xfefcf8f0e1c3870f,
12500 0xaaaaaaaaaaaaaaaa};
12501 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
12502 }
12503}
TatWai Chongd316c5e2019-10-16 12:22:10 -070012504
TatWai Chong7a0d3672019-10-23 17:35:18 -070012505TEST_SVE(sve_binary_arithmetic_predicated_fmax_fmin_h) {
12506 double zd_inputs[] = {1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8};
12507 double zn_inputs[] = {-2.1,
12508 8.5,
12509 225.5,
12510 0.0,
12511 8.8,
12512 -4.75,
12513 kFP64PositiveInfinity,
12514 kFP64NegativeInfinity};
12515 double zm_inputs[] = {-2.0,
12516 -13.0,
12517 24.0,
12518 0.01,
12519 0.5,
12520 300.75,
12521 kFP64NegativeInfinity,
12522 kFP64PositiveInfinity};
12523 int pg_inputs[] = {1, 1, 0, 1, 0, 1, 1, 1};
12524
12525 uint16_t zd_expected_max[] = {Float16ToRawbits(Float16(-2.0)),
12526 Float16ToRawbits(Float16(8.5)),
12527 Float16ToRawbits(Float16(3.3)),
12528 Float16ToRawbits(Float16(0.01)),
12529 Float16ToRawbits(Float16(5.5)),
12530 Float16ToRawbits(Float16(300.75)),
12531 Float16ToRawbits(kFP16PositiveInfinity),
12532 Float16ToRawbits(kFP16PositiveInfinity)};
12533 FPBinArithHelper(config,
12534 &MacroAssembler::Fmax,
Martyn Capewell37f28182020-01-14 10:15:10 +000012535 NULL,
TatWai Chong7a0d3672019-10-23 17:35:18 -070012536 kHRegSize,
12537 zd_inputs,
12538 pg_inputs,
12539 zn_inputs,
12540 zm_inputs,
12541 zd_expected_max);
12542
12543 uint16_t zd_expected_min[] = {Float16ToRawbits(Float16(-2.1)),
12544 Float16ToRawbits(Float16(-13.0)),
12545 Float16ToRawbits(Float16(3.3)),
12546 Float16ToRawbits(Float16(0.0)),
12547 Float16ToRawbits(Float16(5.5)),
12548 Float16ToRawbits(Float16(-4.75)),
12549 Float16ToRawbits(kFP16NegativeInfinity),
12550 Float16ToRawbits(kFP16NegativeInfinity)};
12551 FPBinArithHelper(config,
12552 &MacroAssembler::Fmin,
Martyn Capewell37f28182020-01-14 10:15:10 +000012553 NULL,
TatWai Chong7a0d3672019-10-23 17:35:18 -070012554 kHRegSize,
12555 zd_inputs,
12556 pg_inputs,
12557 zn_inputs,
12558 zm_inputs,
12559 zd_expected_min);
12560}
12561
12562TEST_SVE(sve_binary_arithmetic_predicated_fmax_fmin_s) {
12563 double zd_inputs[] = {1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8};
12564 double zn_inputs[] = {-2.1,
12565 8.5,
12566 225.5,
12567 0.0,
12568 8.8,
12569 -4.75,
12570 kFP64PositiveInfinity,
12571 kFP64NegativeInfinity};
12572 double zm_inputs[] = {-2.0,
12573 -13.0,
12574 24.0,
12575 0.01,
12576 0.5,
12577 300.75,
12578 kFP64NegativeInfinity,
12579 kFP64PositiveInfinity};
12580 int pg_inputs[] = {1, 1, 0, 1, 0, 1, 1, 1};
12581
12582 uint32_t zd_expected_max[] = {FloatToRawbits(-2.0),
12583 FloatToRawbits(8.5),
12584 FloatToRawbits(3.3),
12585 FloatToRawbits(0.01),
12586 FloatToRawbits(5.5),
12587 FloatToRawbits(300.75),
12588 FloatToRawbits(kFP32PositiveInfinity),
12589 FloatToRawbits(kFP32PositiveInfinity)};
12590 FPBinArithHelper(config,
12591 &MacroAssembler::Fmax,
Martyn Capewell37f28182020-01-14 10:15:10 +000012592 NULL,
TatWai Chong7a0d3672019-10-23 17:35:18 -070012593 kSRegSize,
12594 zd_inputs,
12595 pg_inputs,
12596 zn_inputs,
12597 zm_inputs,
12598 zd_expected_max);
12599
12600 uint32_t zd_expected_min[] = {FloatToRawbits(-2.1),
12601 FloatToRawbits(-13.0),
12602 FloatToRawbits(3.3),
12603 FloatToRawbits(0.0),
12604 FloatToRawbits(5.5),
12605 FloatToRawbits(-4.75),
12606 FloatToRawbits(kFP32NegativeInfinity),
12607 FloatToRawbits(kFP32NegativeInfinity)};
12608 FPBinArithHelper(config,
12609 &MacroAssembler::Fmin,
Martyn Capewell37f28182020-01-14 10:15:10 +000012610 NULL,
TatWai Chong7a0d3672019-10-23 17:35:18 -070012611 kSRegSize,
12612 zd_inputs,
12613 pg_inputs,
12614 zn_inputs,
12615 zm_inputs,
12616 zd_expected_min);
12617}
12618
12619TEST_SVE(sve_binary_arithmetic_predicated_fmax_fmin_d) {
12620 double zd_inputs[] = {1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8};
12621 double zn_inputs[] = {-2.1,
12622 8.5,
12623 225.5,
12624 0.0,
12625 8.8,
12626 -4.75,
12627 kFP64PositiveInfinity,
12628 kFP64NegativeInfinity};
12629 double zm_inputs[] = {-2.0,
12630 -13.0,
12631 24.0,
12632 0.01,
12633 0.5,
12634 300.75,
12635 kFP64NegativeInfinity,
12636 kFP64PositiveInfinity};
12637 int pg_inputs[] = {1, 1, 0, 1, 0, 1, 1, 1};
12638
12639 uint64_t zd_expected_max[] = {DoubleToRawbits(-2.0),
12640 DoubleToRawbits(8.5),
12641 DoubleToRawbits(3.3),
12642 DoubleToRawbits(0.01),
12643 DoubleToRawbits(5.5),
12644 DoubleToRawbits(300.75),
12645 DoubleToRawbits(kFP64PositiveInfinity),
12646 DoubleToRawbits(kFP64PositiveInfinity)};
12647 FPBinArithHelper(config,
12648 &MacroAssembler::Fmax,
Martyn Capewell37f28182020-01-14 10:15:10 +000012649 NULL,
TatWai Chong7a0d3672019-10-23 17:35:18 -070012650 kDRegSize,
12651 zd_inputs,
12652 pg_inputs,
12653 zn_inputs,
12654 zm_inputs,
12655 zd_expected_max);
12656
12657 uint64_t zd_expected_min[] = {DoubleToRawbits(-2.1),
12658 DoubleToRawbits(-13.0),
12659 DoubleToRawbits(3.3),
12660 DoubleToRawbits(0.0),
12661 DoubleToRawbits(5.5),
12662 DoubleToRawbits(-4.75),
12663 DoubleToRawbits(kFP64NegativeInfinity),
12664 DoubleToRawbits(kFP64NegativeInfinity)};
12665 FPBinArithHelper(config,
12666 &MacroAssembler::Fmin,
Martyn Capewell37f28182020-01-14 10:15:10 +000012667 NULL,
TatWai Chong7a0d3672019-10-23 17:35:18 -070012668 kDRegSize,
12669 zd_inputs,
12670 pg_inputs,
12671 zn_inputs,
12672 zm_inputs,
12673 zd_expected_min);
12674}
TatWai Chong29a0c432019-11-06 22:20:44 -080012675
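// Check Asr, Lsr and Lsl with an immediate shift amount. Asr and Lsr accept
// shifts of 1 to the lane size, whereas Lsl accepts 0 to (lane size - 1), so
// Lsl is exercised with `shift - 1`.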
12676template <typename T, size_t N>
12677static void BitwiseShiftImmHelper(Test* config,
12678 int lane_size_in_bits,
12679 const T (&zn_inputs)[N],
12680 int shift) {
12681 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
12682 START();
12683
12684 ZRegister zd_asr = z25.WithLaneSize(lane_size_in_bits);
12685 ZRegister zd_lsr = z26.WithLaneSize(lane_size_in_bits);
12686 ZRegister zd_lsl = z27.WithLaneSize(lane_size_in_bits);
12687 ZRegister zn = z28.WithLaneSize(lane_size_in_bits);
12688
12689 InsrHelper(&masm, zn, zn_inputs);
12690
12691 __ Asr(zd_asr, zn, shift);
12692 __ Lsr(zd_lsr, zn, shift);
  __ Lsl(zd_lsl, zn, shift - 1);  // Lsl supports shifts of 0 to lane_size - 1.
TatWai Chong29a0c432019-11-06 22:20:44 -080012694
12695 END();
12696
12697 if (CAN_RUN()) {
12698 RUN();
12699
12700 const uint64_t mask = GetUintMask(lane_size_in_bits);
12701 for (int i = 0; i < static_cast<int>(N); i++) {
12702 int lane = N - i - 1;
12703 if (!core.HasSVELane(zd_asr, lane)) break;
12704 bool is_negative = (zn_inputs[i] & GetSignMask(lane_size_in_bits)) != 0;
12705 uint64_t result;
12706 if (shift >= lane_size_in_bits) {
12707 result = is_negative ? mask : 0;
12708 } else {
12709 result = zn_inputs[i] >> shift;
12710 if (is_negative) {
12711 result |= mask << (lane_size_in_bits - shift);
12712 result &= mask;
12713 }
12714 }
12715 ASSERT_EQUAL_SVE_LANE(result, zd_asr, lane);
12716 }
12717
12718 for (int i = 0; i < static_cast<int>(N); i++) {
12719 int lane = N - i - 1;
12720 if (!core.HasSVELane(zd_lsr, lane)) break;
12721 uint64_t result =
12722 (shift >= lane_size_in_bits) ? 0 : zn_inputs[i] >> shift;
12723 ASSERT_EQUAL_SVE_LANE(result, zd_lsr, lane);
12724 }
12725
12726 for (int i = 0; i < static_cast<int>(N); i++) {
12727 int lane = N - i - 1;
12728 if (!core.HasSVELane(zd_lsl, lane)) break;
Jacob Bramley504d5e92020-05-21 11:40:21 +010012729 uint64_t result =
12730 (shift > lane_size_in_bits) ? 0 : zn_inputs[i] << (shift - 1);
TatWai Chong29a0c432019-11-06 22:20:44 -080012731 ASSERT_EQUAL_SVE_LANE(result & mask, zd_lsl, lane);
12732 }
12733 }
12734}
12735
12736TEST_SVE(sve_bitwise_shift_imm_unpredicated) {
12737 uint64_t inputs_b[] = {0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80};
12738 int shift_b[] = {1, 3, 5, 8};
12739 for (size_t i = 0; i < ArrayLength(shift_b); i++) {
12740 BitwiseShiftImmHelper(config, kBRegSize, inputs_b, shift_b[i]);
12741 }
12742
12743 uint64_t inputs_h[] = {0xfedc, 0xfa55, 0x0011, 0x2233};
12744 int shift_h[] = {1, 8, 11, 16};
12745 for (size_t i = 0; i < ArrayLength(shift_h); i++) {
12746 BitwiseShiftImmHelper(config, kHRegSize, inputs_h, shift_h[i]);
12747 }
12748
12749 uint64_t inputs_s[] = {0xfedcba98, 0xfffa55aa, 0x00112233};
12750 int shift_s[] = {1, 9, 17, 32};
12751 for (size_t i = 0; i < ArrayLength(shift_s); i++) {
12752 BitwiseShiftImmHelper(config, kSRegSize, inputs_s, shift_s[i]);
12753 }
12754
12755 uint64_t inputs_d[] = {0xfedcba98fedcba98,
12756 0xfffa5555aaaaaaaa,
12757 0x0011223344aafe80};
12758 int shift_d[] = {1, 23, 45, 64};
12759 for (size_t i = 0; i < ArrayLength(shift_d); i++) {
12760 BitwiseShiftImmHelper(config, kDRegSize, inputs_d, shift_d[i]);
12761 }
12762}
12763
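// Check a bitwise shift by a vector of wide (D-lane) shift amounts, including
// the boundary cases where the shift amount equals the lane size or lies
// outside the representable shift range.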
12764template <typename T, typename R, size_t N>
12765static void BitwiseShiftWideElementsHelper(Test* config,
12766 Shift shift_type,
12767 int lane_size_in_bits,
12768 const T (&zn_inputs)[N],
12769 const R& zm_inputs,
12770 const T (&zd_expected)[N]) {
12771 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
12772 START();
12773
12774 ArithFn macro;
  // A logical shift left or right by the current lane size produces 0, so
  // initialize the array to 0 for convenience.
12777 uint64_t zd_expected_max_shift_amount[N] = {0};
12778 switch (shift_type) {
12779 case ASR: {
12780 macro = &MacroAssembler::Asr;
12781 uint64_t mask = GetUintMask(lane_size_in_bits);
12782 for (size_t i = 0; i < ArrayLength(zn_inputs); i++) {
12783 bool is_negative = (zn_inputs[i] & GetSignMask(lane_size_in_bits)) != 0;
12784 zd_expected_max_shift_amount[i] = is_negative ? mask : 0;
12785 }
12786 break;
12787 }
12788 case LSR:
12789 macro = &MacroAssembler::Lsr;
12790 break;
12791 case LSL:
12792 macro = &MacroAssembler::Lsl;
12793 break;
12794 default:
12795 VIXL_UNIMPLEMENTED();
12796 macro = NULL;
12797 break;
12798 }
12799
12800 ZRegister zd = z26.WithLaneSize(lane_size_in_bits);
12801 ZRegister zn = z27.WithLaneSize(lane_size_in_bits);
12802 ZRegister zm = z28.WithLaneSize(kDRegSize);
12803
12804 InsrHelper(&masm, zn, zn_inputs);
12805 InsrHelper(&masm, zm, zm_inputs);
12806
12807 (masm.*macro)(zd, zn, zm);
12808
12809 ZRegister zm_max_shift_amount = z25.WithLaneSize(kDRegSize);
12810 ZRegister zd_max_shift_amount = z24.WithLaneSize(lane_size_in_bits);
12811
12812 __ Dup(zm_max_shift_amount, lane_size_in_bits);
12813 (masm.*macro)(zd_max_shift_amount, zn, zm_max_shift_amount);
12814
12815 ZRegister zm_out_of_range = z23.WithLaneSize(kDRegSize);
12816 ZRegister zd_out_of_range = z22.WithLaneSize(lane_size_in_bits);
12817
12818 __ Dup(zm_out_of_range, GetUintMask(lane_size_in_bits));
12819 (masm.*macro)(zd_out_of_range, zn, zm_out_of_range);
12820
12821 END();
12822
12823 if (CAN_RUN()) {
12824 RUN();
12825
12826 ASSERT_EQUAL_SVE(zd_expected, zd);
12827 ASSERT_EQUAL_SVE(zd_expected_max_shift_amount, zd_max_shift_amount);
12828 ASSERT_EQUAL_SVE(zd_max_shift_amount, zd_out_of_range);
12829 }
12830}
12831
12832TEST_SVE(sve_bitwise_shift_wide_elements_unpredicated_asr) {
12833 // clang-format off
12834 uint64_t inputs_b[] = {0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80,
12835 0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80};
12836 int shift_b[] = {1, 3};
12837 uint64_t expected_b[] = {0xff, 0xee, 0xdd, 0xcc, 0xff, 0x2a, 0xd5, 0xc0,
12838 0xff, 0xfb, 0xf7, 0xf3, 0xff, 0x0a, 0xf5, 0xf0};
12839 BitwiseShiftWideElementsHelper(config,
12840 ASR,
12841 kBRegSize,
12842 inputs_b,
12843 shift_b,
12844 expected_b);
12845
12846 uint64_t inputs_h[] = {0xfedc, 0xfa55, 0x0011, 0x2233,
12847 0xfedc, 0xfa55, 0x0011, 0x2233,
12848 0xfedc, 0xfa55, 0x0011, 0x2233};
12849 int shift_h[] = {1, 8, 11};
12850 uint64_t expected_h[] = {0xff6e, 0xfd2a, 0x0008, 0x1119,
12851 0xfffe, 0xfffa, 0x0000, 0x0022,
12852 0xffff, 0xffff, 0x0000, 0x0004};
12853 BitwiseShiftWideElementsHelper(config,
12854 ASR,
12855 kHRegSize,
12856 inputs_h,
12857 shift_h,
12858 expected_h);
12859
12860 uint64_t inputs_s[] =
12861 {0xfedcba98, 0xfffa55aa, 0x00112233, 0x01234567, 0xaaaaaaaa, 0x88888888};
12862 int shift_s[] = {1, 9, 23};
12863 uint64_t expected_s[] =
12864 {0xff6e5d4c, 0xfffd2ad5, 0x00000891, 0x000091a2, 0xffffff55, 0xffffff11};
12865 BitwiseShiftWideElementsHelper(config,
12866 ASR,
12867 kSRegSize,
12868 inputs_s,
12869 shift_s,
12870 expected_s);
12871 // clang-format on
12872}
12873
12874TEST_SVE(sve_bitwise_shift_wide_elements_unpredicated_lsr) {
12875 // clang-format off
12876 uint64_t inputs_b[] = {0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80,
12877 0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80};
12878 int shift_b[] = {1, 3};
12879 uint64_t expected_b[] = {0x7f, 0x6e, 0x5d, 0x4c, 0x7f, 0x2a, 0x55, 0x40,
12880 0x1f, 0x1b, 0x17, 0x13, 0x1f, 0x0a, 0x15, 0x10};
12881
12882 BitwiseShiftWideElementsHelper(config,
12883 LSR,
12884 kBRegSize,
12885 inputs_b,
12886 shift_b,
12887 expected_b);
12888
12889 uint64_t inputs_h[] = {0xfedc, 0xfa55, 0x0011, 0x2233,
12890 0xfedc, 0xfa55, 0x0011, 0x2233,
12891 0xfedc, 0xfa55, 0x0011, 0x2233};
12892 int shift_h[] = {1, 8, 11};
12893 uint64_t expected_h[] = {0x7f6e, 0x7d2a, 0x0008, 0x1119,
12894 0x00fe, 0x00fa, 0x0000, 0x0022,
12895 0x001f, 0x001f, 0x0000, 0x0004};
12896 BitwiseShiftWideElementsHelper(config,
12897 LSR,
12898 kHRegSize,
12899 inputs_h,
12900 shift_h,
12901 expected_h);
12902
12903 uint64_t inputs_s[] =
12904 {0xfedcba98, 0xfffa55aa, 0x00112233, 0x01234567, 0xaaaaaaaa, 0x88888888};
12905 int shift_s[] = {1, 9, 23};
12906 uint64_t expected_s[] =
12907 {0x7f6e5d4c, 0x7ffd2ad5, 0x00000891, 0x000091a2, 0x00000155, 0x00000111};
12908 BitwiseShiftWideElementsHelper(config,
12909 LSR,
12910 kSRegSize,
12911 inputs_s,
12912 shift_s,
12913 expected_s);
12914 // clang-format on
12915}
12916
12917TEST_SVE(sve_bitwise_shift_wide_elements_unpredicated_lsl) {
12918 // clang-format off
12919 uint64_t inputs_b[] = {0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80,
12920 0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80};
12921 int shift_b[] = {1, 5};
12922
12923 uint64_t expected_b[] = {0xfc, 0xb8, 0x74, 0x30, 0xfe, 0xaa, 0x54, 0x00,
12924 0xc0, 0x80, 0x40, 0x00, 0xe0, 0xa0, 0x40, 0x00};
12925
12926 BitwiseShiftWideElementsHelper(config,
12927 LSL,
12928 kBRegSize,
12929 inputs_b,
12930 shift_b,
12931 expected_b);
12932 uint64_t inputs_h[] = {0xfedc, 0xfa55, 0x0011, 0x2233,
12933 0xfedc, 0xfa55, 0x0011, 0x2233,
12934 0xfedc, 0xfa55, 0x0011, 0x2233};
12935 int shift_h[] = {1, 2, 14};
12936
12937 uint64_t expected_h[] = {0xfdb8, 0xf4aa, 0x0022, 0x4466,
12938 0xfb70, 0xe954, 0x0044, 0x88cc,
12939 0x0000, 0x4000, 0x4000, 0xc000};
12940 BitwiseShiftWideElementsHelper(config,
12941 LSL,
12942 kHRegSize,
12943 inputs_h,
12944 shift_h,
12945 expected_h);
12946 uint64_t inputs_s[] =
12947 {0xfedcba98, 0xfffa55aa, 0x00112233, 0x01234567, 0xaaaaaaaa, 0x88888888};
12948 int shift_s[] = {1, 19, 26};
12949 uint64_t expected_s[] =
12950 {0xfdb97530, 0xfff4ab54, 0x11980000, 0x2b380000, 0xa8000000, 0x20000000};
12951 BitwiseShiftWideElementsHelper(config,
12952 LSL,
12953 kSRegSize,
12954 inputs_s,
12955 shift_s,
12956 expected_s);
Martyn Capewell3bf2d162020-02-17 15:04:36 +000012957
12958 // Test large shifts outside the range of the "unsigned" type.
12959 uint64_t inputs_b2[] = {1, 2, 4, 8, 3, 5, 7, 9,
12960 1, 2, 4, 8, 3, 5, 7, 9};
12961 uint64_t shift_b2[] = {1, 0x1000000001};
12962 uint64_t expected_b2[] = {2, 4, 8, 16, 6, 10, 14, 18,
12963 0, 0, 0, 0, 0, 0, 0, 0};
12964 BitwiseShiftWideElementsHelper(config, LSL, kBRegSize, inputs_b2, shift_b2,
12965 expected_b2);
12966
TatWai Chong29a0c432019-11-06 22:20:44 -080012967 // clang-format on
12968}
12969
Martyn Capewell76c094a2020-02-13 17:26:49 +000012970TEST_SVE(sve_shift_by_vector) {
12971 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
12972
12973 START();
12974 __ Ptrue(p0.VnB());
12975 __ Pfalse(p1.VnB());
12976 __ Zip1(p2.VnB(), p0.VnB(), p1.VnB());
12977 __ Zip1(p3.VnH(), p0.VnH(), p1.VnH());
12978 __ Zip1(p4.VnS(), p0.VnS(), p1.VnS());
12979 __ Zip1(p5.VnD(), p0.VnD(), p1.VnD());
12980
12981 __ Dup(z31.VnD(), 0x8000000080008080);
12982 __ Dup(z0.VnB(), -1);
12983
12984 __ Index(z1.VnB(), 0, 1);
12985 __ Dup(z2.VnB(), 0x55);
12986 __ Lsr(z2.VnB(), p2.Merging(), z0.VnB(), z1.VnB());
12987 __ Lsl(z3.VnB(), p0.Merging(), z0.VnB(), z1.VnB());
12988 __ Asr(z4.VnB(), p0.Merging(), z31.VnB(), z1.VnB());
12989
12990 __ Index(z1.VnH(), 0, 1);
12991 __ Dup(z6.VnB(), 0x55);
12992 __ Lsr(z5.VnH(), p0.Merging(), z0.VnH(), z1.VnH());
12993 __ Lsl(z6.VnH(), p3.Merging(), z0.VnH(), z1.VnH());
12994 __ Asr(z7.VnH(), p0.Merging(), z31.VnH(), z1.VnH());
12995
12996 __ Index(z1.VnS(), 0, 1);
12997 __ Dup(z10.VnB(), 0x55);
12998 __ Lsr(z8.VnS(), p0.Merging(), z0.VnS(), z1.VnS());
12999 __ Lsl(z9.VnS(), p0.Merging(), z0.VnS(), z1.VnS());
13000 __ Asr(z10.VnS(), p4.Merging(), z31.VnS(), z1.VnS());
13001
13002 __ Index(z1.VnD(), 0, 1);
13003 __ Lsr(z0.VnD(), p5.Merging(), z0.VnD(), z1.VnD());
13004 __ Lsl(z12.VnD(), p0.Merging(), z0.VnD(), z1.VnD());
13005 __ Asr(z13.VnD(), p0.Merging(), z31.VnD(), z1.VnD());
13006
13007 __ Dup(z11.VnD(), 0x100000001);
13008 __ Lsl(z14.VnD(), p0.Merging(), z1.VnD(), z11.VnD());
13009
13010 __ Index(z0.VnH(), 7, -1);
13011 __ Lsr(z0.VnH(), p0.Merging(), z31.VnH(), z0.VnH());
13012 END();
13013
13014 if (CAN_RUN()) {
13015 RUN();
13016
13017 uint64_t expected_z0[] = {0x8000000020001010, 0x0800000002000101};
13018 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
13019 uint64_t expected_z2[] = {0x5500550055005500, 0x5503550f553f55ff};
13020 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
13021 uint64_t expected_z3[] = {0x0000000000000000, 0x80c0e0f0f8fcfeff};
13022 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
13023 uint64_t expected_z4[] = {0xff000000ff00ffff, 0xff000000f000c080};
13024 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
13025 uint64_t expected_z5[] = {0x01ff03ff07ff0fff, 0x1fff3fff7fffffff};
13026 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
13027 uint64_t expected_z6[] = {0x5555ffc05555fff0, 0x5555fffc5555ffff};
13028 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
13029 uint64_t expected_z7[] = {0xff000000fc00f808, 0xf0000000c0008080};
13030 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
13031 uint64_t expected_z8[] = {0x1fffffff3fffffff, 0x7fffffffffffffff};
13032 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
13033 uint64_t expected_z9[] = {0xfffffff8fffffffc, 0xfffffffeffffffff};
13034 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
13035 uint64_t expected_z10[] = {0x55555555e0002020, 0x5555555580008080};
13036 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
13037 uint64_t expected_z12[] = {0xfffffffffffffffe, 0xffffffffffffffff};
13038 ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
13039 uint64_t expected_z13[] = {0xc000000040004040, 0x8000000080008080};
13040 ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
13041 uint64_t expected_z14[] = {0, 0};
13042 ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
13043 }
13044}
13045
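// In the wide-element form of these shifts, the shift amounts come from the
// D-sized lanes of the last operand: each 64-bit value is applied to all of
// the narrower lanes within the corresponding 64-bit segment.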
13046TEST_SVE(sve_shift_by_wide_vector) {
13047 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13048
13049 START();
13050 __ Ptrue(p0.VnB());
13051 __ Pfalse(p1.VnB());
13052 __ Zip1(p2.VnB(), p0.VnB(), p1.VnB());
13053 __ Zip1(p3.VnH(), p0.VnH(), p1.VnH());
13054 __ Zip1(p4.VnS(), p0.VnS(), p1.VnS());
13055
13056 __ Dup(z31.VnD(), 0x8000000080008080);
13057 __ Dup(z0.VnB(), -1);
13058 __ Index(z1.VnD(), 1, 5);
13059
13060 __ Dup(z2.VnB(), 0x55);
13061 __ Lsr(z2.VnB(), p2.Merging(), z2.VnB(), z1.VnD());
13062 __ Lsl(z3.VnB(), p0.Merging(), z0.VnB(), z1.VnD());
13063 __ Asr(z4.VnB(), p0.Merging(), z31.VnB(), z1.VnD());
13064
13065 __ Dup(z6.VnB(), 0x55);
13066 __ Lsr(z5.VnH(), p0.Merging(), z0.VnH(), z1.VnD());
13067 __ Lsl(z6.VnH(), p3.Merging(), z6.VnH(), z1.VnD());
13068 __ Asr(z7.VnH(), p0.Merging(), z31.VnH(), z1.VnD());
13069
13070 __ Dup(z10.VnB(), 0x55);
13071 __ Lsr(z8.VnS(), p0.Merging(), z0.VnS(), z1.VnD());
13072 __ Lsl(z9.VnS(), p0.Merging(), z0.VnS(), z1.VnD());
13073 __ Asr(z10.VnS(), p4.Merging(), z31.VnS(), z1.VnD());
13074 END();
13075
13076 if (CAN_RUN()) {
13077 RUN();
13078
13079 uint64_t expected_z2[] = {0x5501550155015501, 0x552a552a552a552a};
13080 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
13081 uint64_t expected_z3[] = {0xc0c0c0c0c0c0c0c0, 0xfefefefefefefefe};
13082 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
13083 uint64_t expected_z4[] = {0xfe000000fe00fefe, 0xc0000000c000c0c0};
13084 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
13085 uint64_t expected_z5[] = {0x03ff03ff03ff03ff, 0x7fff7fff7fff7fff};
13086 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
13087 uint64_t expected_z6[] = {0x5555554055555540, 0x5555aaaa5555aaaa};
13088 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
13089 uint64_t expected_z7[] = {0xfe000000fe00fe02, 0xc0000000c000c040};
13090 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
13091 uint64_t expected_z8[] = {0x03ffffff03ffffff, 0x7fffffff7fffffff};
13092 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
13093 uint64_t expected_z9[] = {0xffffffc0ffffffc0, 0xfffffffefffffffe};
13094 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
13095 uint64_t expected_z10[] = {0x55555555fe000202, 0x55555555c0004040};
13096 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
13097 }
13098}
13099
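// Check predicated shifts by immediate: shift right, then shift selected lanes
// back left under a merging predicate (so only the active lanes are restored),
// and finally arithmetic-shift the combined result to check sign handling.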
Martyn Capewell83e86612020-02-19 15:46:15 +000013100TEST_SVE(sve_pred_shift_imm) {
13101 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13102
13103 START();
13104 __ Ptrue(p0.VnB());
13105 __ Pfalse(p1.VnB());
13106 __ Zip1(p2.VnB(), p0.VnB(), p1.VnB());
13107 __ Zip1(p3.VnH(), p0.VnH(), p1.VnH());
13108 __ Zip1(p4.VnS(), p0.VnS(), p1.VnS());
13109 __ Zip1(p5.VnD(), p0.VnD(), p1.VnD());
13110
13111 __ Dup(z31.VnD(), 0x8000000080008080);
13112 __ Lsr(z0.VnB(), p0.Merging(), z31.VnB(), 1);
13113 __ Mov(z1, z0);
13114 __ Lsl(z1.VnB(), p2.Merging(), z1.VnB(), 1);
13115 __ Asr(z2.VnB(), p0.Merging(), z1.VnB(), 2);
13116
13117 __ Lsr(z3.VnH(), p0.Merging(), z31.VnH(), 2);
13118 __ Mov(z4, z3);
13119 __ Lsl(z4.VnH(), p3.Merging(), z4.VnH(), 2);
13120 __ Asr(z5.VnH(), p0.Merging(), z4.VnH(), 3);
13121
13122 __ Lsr(z6.VnS(), p0.Merging(), z31.VnS(), 3);
13123 __ Mov(z7, z6);
13124 __ Lsl(z7.VnS(), p4.Merging(), z7.VnS(), 3);
13125 __ Asr(z8.VnS(), p0.Merging(), z7.VnS(), 4);
13126
13127 __ Lsr(z9.VnD(), p0.Merging(), z31.VnD(), 4);
13128 __ Mov(z10, z9);
13129 __ Lsl(z10.VnD(), p5.Merging(), z10.VnD(), 4);
13130 __ Asr(z11.VnD(), p0.Merging(), z10.VnD(), 5);
13131 END();
13132
13133 if (CAN_RUN()) {
13134 RUN();
13135 uint64_t expected_z0[] = {0x4000000040004040, 0x4000000040004040};
13136 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
13137 uint64_t expected_z1[] = {0x4000000040004080, 0x4000000040004080};
13138 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
13139 uint64_t expected_z2[] = {0x10000000100010e0, 0x10000000100010e0};
13140 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
13141 uint64_t expected_z3[] = {0x2000000020002020, 0x2000000020002020};
13142 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
13143 uint64_t expected_z4[] = {0x2000000020008080, 0x2000000020008080};
13144 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
13145 uint64_t expected_z5[] = {0x040000000400f010, 0x040000000400f010};
13146 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
13147 uint64_t expected_z6[] = {0x1000000010001010, 0x1000000010001010};
13148 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
13149 uint64_t expected_z7[] = {0x1000000080008080, 0x1000000080008080};
13150 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
13151 uint64_t expected_z8[] = {0x01000000f8000808, 0x01000000f8000808};
13152 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
13153 uint64_t expected_z9[] = {0x0800000008000808, 0x0800000008000808};
13154 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
13155 uint64_t expected_z10[] = {0x0800000008000808, 0x8000000080008080};
13156 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
13157 uint64_t expected_z11[] = {0x0040000000400040, 0xfc00000004000404};
13158 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
13159 }
13160}
13161
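// Asrd is an arithmetic shift right intended for signed division by a power of
// two: unlike Asr it rounds towards zero, so for example -1 shifted by 1 gives
// 0 rather than -1.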
13162TEST_SVE(sve_asrd) {
13163 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13164
13165 START();
13166 __ Ptrue(p0.VnB());
13167 __ Pfalse(p1.VnB());
13168 __ Zip1(p2.VnB(), p0.VnB(), p1.VnB());
13169 __ Zip1(p3.VnH(), p0.VnH(), p1.VnH());
13170 __ Zip1(p4.VnS(), p0.VnS(), p1.VnS());
13171 __ Zip1(p5.VnD(), p0.VnD(), p1.VnD());
13172
13173 __ Index(z31.VnB(), 0x7f - 3, 1);
13174 __ Asrd(z0.VnB(), p0.Merging(), z31.VnB(), 1);
13175 __ Mov(z1, z31);
13176 __ Asrd(z1.VnB(), p2.Merging(), z1.VnB(), 2);
13177 __ Asrd(z2.VnB(), p0.Merging(), z31.VnB(), 7);
13178 __ Asrd(z3.VnB(), p0.Merging(), z31.VnB(), 8);
13179
13180 __ Index(z31.VnH(), 0x7fff - 3, 1);
13181 __ Asrd(z4.VnH(), p0.Merging(), z31.VnH(), 1);
13182 __ Mov(z5, z31);
13183 __ Asrd(z5.VnH(), p3.Merging(), z5.VnH(), 2);
13184 __ Asrd(z6.VnH(), p0.Merging(), z31.VnH(), 15);
13185 __ Asrd(z7.VnH(), p0.Merging(), z31.VnH(), 16);
13186
13187 __ Index(z31.VnS(), 0x7fffffff - 1, 1);
13188 __ Asrd(z8.VnS(), p0.Merging(), z31.VnS(), 1);
13189 __ Mov(z9, z31);
13190 __ Asrd(z9.VnS(), p4.Merging(), z9.VnS(), 2);
13191 __ Asrd(z10.VnS(), p0.Merging(), z31.VnS(), 31);
13192 __ Asrd(z11.VnS(), p0.Merging(), z31.VnS(), 32);
13193
13194 __ Index(z31.VnD(), 0x7fffffffffffffff, 1);
13195 __ Asrd(z12.VnD(), p0.Merging(), z31.VnD(), 1);
13196 __ Mov(z13, z31);
13197 __ Asrd(z13.VnD(), p5.Merging(), z13.VnD(), 2);
13198 __ Asrd(z14.VnD(), p0.Merging(), z31.VnD(), 63);
13199 __ Asrd(z31.VnD(), p0.Merging(), z31.VnD(), 64);
13200 END();
13201
13202 if (CAN_RUN()) {
13203 RUN();
13204 uint64_t expected_z0[] = {0xc6c5c5c4c4c3c3c2, 0xc2c1c1c03f3f3e3e};
13205 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
13206 uint64_t expected_z1[] = {0x8be389e287e285e1, 0x83e181e07f1f7d1f};
13207 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
13208 uint64_t expected_z2[] = {0x0000000000000000, 0x000000ff00000000};
13209 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
13210 uint64_t expected_z3[] = {0x0000000000000000, 0x0000000000000000};
13211 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
13212 uint64_t expected_z4[] = {0xc002c001c001c000, 0x3fff3fff3ffe3ffe};
13213 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
13214 uint64_t expected_z5[] = {0x8003e0018001e000, 0x7fff1fff7ffd1fff};
13215 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
13216 uint64_t expected_z6[] = {0x000000000000ffff, 0x0000000000000000};
13217 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
13218 uint64_t expected_z7[] = {0x0000000000000000, 0x0000000000000000};
13219 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
13220 uint64_t expected_z8[] = {0xc0000001c0000000, 0x3fffffff3fffffff};
13221 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
13222 uint64_t expected_z9[] = {0x80000001e0000000, 0x7fffffff1fffffff};
13223 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
13224 uint64_t expected_z10[] = {0x00000000ffffffff, 0x0000000000000000};
13225 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
13226 uint64_t expected_z11[] = {0x0000000000000000, 0x0000000000000000};
13227 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
13228 uint64_t expected_z12[] = {0xc000000000000000, 0x3fffffffffffffff};
13229 ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
13230 uint64_t expected_z13[] = {0x8000000000000000, 0x1fffffffffffffff};
13231 ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
13232 uint64_t expected_z14[] = {0xffffffffffffffff, 0x0000000000000000};
13233 ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
13234 uint64_t expected_z31[] = {0x0000000000000000, 0x0000000000000000};
13235 ASSERT_EQUAL_SVE(expected_z31, z31.VnD());
13236 }
13237}
13238
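// Setffr sets every bit of the first-fault register (FFR), so reading it back
// with Rdffr should match an all-true predicate.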
TatWai Chong4023d7a2019-11-18 14:16:28 -080013239TEST_SVE(sve_setffr) {
13240 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13241 START();
13242
13243 __ Ptrue(p15.VnB());
13244 __ Setffr();
13245 __ Rdffr(p14.VnB());
13246
13247 END();
13248
13249 if (CAN_RUN()) {
13250 RUN();
13251
13252 ASSERT_EQUAL_SVE(p14.VnB(), p15.VnB());
13253 }
13254}
13255
13256static void WrffrHelper(Test* config, unsigned active_lanes) {
13257 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13258 START();
13259
13260 int inputs[kPRegMaxSize] = {0};
13261 VIXL_ASSERT(active_lanes <= kPRegMaxSize);
13262 for (unsigned i = 0; i < active_lanes; i++) {
13263 // The rightmost (highest-indexed) array element maps to the lowest-numbered
13264 // lane.
13265 inputs[kPRegMaxSize - i - 1] = 1;
13266 }
13267
13268 Initialise(&masm, p1.VnB(), inputs);
13269 __ Wrffr(p1.VnB());
13270 __ Rdffr(p2.VnB());
13271
13272 END();
13273
13274 if (CAN_RUN()) {
13275 RUN();
13276
13277 ASSERT_EQUAL_SVE(p1.VnB(), p2.VnB());
13278 }
13279}
13280
13281TEST_SVE(sve_wrffr) {
13282 int active_lanes_inputs[] = {0, 1, 7, 10, 32, 48, kPRegMaxSize};
13283 for (size_t i = 0; i < ArrayLength(active_lanes_inputs); i++) {
13284 WrffrHelper(config, active_lanes_inputs[i]);
13285 }
13286}
13287
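// Check the predicated (zeroing) forms: Rdffr returns the FFR ANDed with the
// governing predicate, and Rdffrs additionally sets NZCV from a predicate test.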
TatWai Chonga3e8b172019-11-22 21:48:56 -080013288template <size_t N>
13289static void RdffrHelper(Test* config,
13290 size_t active_lanes,
13291 const int (&pg_inputs)[N]) {
13292 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13293 START();
13294
13295 VIXL_ASSERT(active_lanes <= kPRegMaxSize);
13296
13297 // The rightmost (highest-indexed) array element maps to the lowest-numbered
13298 // lane.
13299 int pd[kPRegMaxSize] = {0};
13300 for (unsigned i = 0; i < active_lanes; i++) {
13301 pd[kPRegMaxSize - i - 1] = 1;
13302 }
13303
13304 int pg[kPRegMaxSize] = {0};
13305 for (unsigned i = 0; i < N; i++) {
13306 pg[kPRegMaxSize - i - 1] = pg_inputs[i];
13307 }
13308
13309 int pd_expected[kPRegMaxSize] = {0};
13310 for (unsigned i = 0; i < std::min(active_lanes, N); i++) {
13311 int lane = kPRegMaxSize - i - 1;
13312 pd_expected[lane] = pd[lane] & pg[lane];
13313 }
13314
13315 Initialise(&masm, p0.VnB(), pg);
13316 Initialise(&masm, p1.VnB(), pd);
13317
13318 // The unpredicated form of rdffr has been tested in `WrffrHelper`.
13319 __ Wrffr(p1.VnB());
13320 __ Rdffr(p14.VnB(), p0.Zeroing());
13321 __ Rdffrs(p13.VnB(), p0.Zeroing());
13322 __ Mrs(x8, NZCV);
13323
13324 END();
13325
13326 if (CAN_RUN()) {
13327 RUN();
13328
13329 ASSERT_EQUAL_SVE(pd_expected, p14.VnB());
13330 ASSERT_EQUAL_SVE(pd_expected, p13.VnB());
13331 StatusFlags nzcv_expected =
13332 GetPredTestFlags(pd_expected, pg, core.GetSVELaneCount(kBRegSize));
13333 ASSERT_EQUAL_64(nzcv_expected, x8);
13334 }
13335}
13336
13337TEST_SVE(sve_rdffr_rdffrs) {
13338 // clang-format off
13339 int active_lanes_inputs[] = {0, 1, 15, 26, 39, 47, kPRegMaxSize};
13340 int pg_inputs_0[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
13341 int pg_inputs_1[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
13342 int pg_inputs_2[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
13343 int pg_inputs_3[] = {0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1};
13344 int pg_inputs_4[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
13345 // clang-format on
13346
13347 for (size_t i = 0; i < ArrayLength(active_lanes_inputs); i++) {
13348 RdffrHelper(config, active_lanes_inputs[i], pg_inputs_0);
13349 RdffrHelper(config, active_lanes_inputs[i], pg_inputs_1);
13350 RdffrHelper(config, active_lanes_inputs[i], pg_inputs_2);
13351 RdffrHelper(config, active_lanes_inputs[i], pg_inputs_3);
13352 RdffrHelper(config, active_lanes_inputs[i], pg_inputs_4);
13353 }
13354}
13355
TatWai Chong38303d92019-12-02 15:49:29 -080013356typedef void (MacroAssembler::*BrkpFn)(const PRegisterWithLaneSize& pd,
13357 const PRegisterZ& pg,
13358 const PRegisterWithLaneSize& pn,
13359 const PRegisterWithLaneSize& pm);
13360
13361template <typename Tg, typename Tn, typename Td>
13362static void BrkpaBrkpbHelper(Test* config,
13363 BrkpFn macro,
13364 BrkpFn macro_set_flags,
13365 const Tg& pg_inputs,
13366 const Tn& pn_inputs,
13367 const Tn& pm_inputs,
13368 const Td& pd_expected) {
13369 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13370 START();
13371
13372 PRegister pg = p15;
13373 PRegister pn = p14;
13374 PRegister pm = p13;
13375 Initialise(&masm, pg.VnB(), pg_inputs);
13376 Initialise(&masm, pn.VnB(), pn_inputs);
13377 Initialise(&masm, pm.VnB(), pm_inputs);
13378
13379 // Initialise NZCV to an impossible value, to check that we actually write it.
13380 __ Mov(x10, NZCVFlag);
13381 __ Msr(NZCV, x10);
13382
13383 (masm.*macro_set_flags)(p0.VnB(), pg.Zeroing(), pn.VnB(), pm.VnB());
13384 __ Mrs(x0, NZCV);
13385
13386 (masm.*macro)(p1.VnB(), pg.Zeroing(), pn.VnB(), pm.VnB());
13387
13388 END();
13389
13390 if (CAN_RUN()) {
13391 RUN();
13392
13393 ASSERT_EQUAL_SVE(pd_expected, p0.VnB());
13394
13395 // Check that the flags were properly set.
13396 StatusFlags nzcv_expected =
13397 GetPredTestFlags(pd_expected,
13398 pg_inputs,
13399 core.GetSVELaneCount(kBRegSize));
13400 ASSERT_EQUAL_64(nzcv_expected, x0);
13401 ASSERT_EQUAL_SVE(p0.VnB(), p1.VnB());
13402 }
13403}
13404
13405template <typename Tg, typename Tn, typename Td>
13406static void BrkpaHelper(Test* config,
13407 const Tg& pg_inputs,
13408 const Tn& pn_inputs,
13409 const Tn& pm_inputs,
13410 const Td& pd_expected) {
13411 BrkpaBrkpbHelper(config,
13412 &MacroAssembler::Brkpa,
13413 &MacroAssembler::Brkpas,
13414 pg_inputs,
13415 pn_inputs,
13416 pm_inputs,
13417 pd_expected);
13418}
13419
13420template <typename Tg, typename Tn, typename Td>
13421static void BrkpbHelper(Test* config,
13422 const Tg& pg_inputs,
13423 const Tn& pn_inputs,
13424 const Tn& pm_inputs,
13425 const Td& pd_expected) {
13426 BrkpaBrkpbHelper(config,
13427 &MacroAssembler::Brkpb,
13428 &MacroAssembler::Brkpbs,
13429 pg_inputs,
13430 pn_inputs,
13431 pm_inputs,
13432 pd_expected);
13433}
13434
13435TEST_SVE(sve_brkpb) {
13436 // clang-format off
13437 // The last active element of each `pn` input is `true` in all vector length configurations.
13438 // | boundary of 128-bit VL.
13439 // v
13440 int pg_1[] = {1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
13441 int pg_2[] = {1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
13442 int pg_3[] = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
13443
13444 // | highest-numbered lane lowest-numbered lane |
13445 // v v
13446 int pn_1[] = {1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
13447 int pn_2[] = {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
13448 int pn_3[] = {1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1};
13449
13450 int pm_1[] = {1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
13451 int pm_2[] = {0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
13452 int pm_3[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
13453
13454 // | first active
13455 // v
13456 int exp_1_1_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
13457 // | first active
13458 // v
13459 int exp_1_2_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
13460 // | first active
13461 // v
13462 int exp_1_3_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
13463
13464 BrkpbHelper(config, pg_1, pn_1, pm_1, exp_1_1_1);
13465 BrkpbHelper(config, pg_1, pn_2, pm_2, exp_1_2_2);
13466 BrkpbHelper(config, pg_1, pn_3, pm_3, exp_1_3_3);
13467
13468 // | first active
13469 // v
13470 int exp_2_1_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
13471 // | first active
13472 // v
13473 int exp_2_2_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
13474 // | first active
13475 // v
13476 int exp_2_3_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1};
13477 BrkpbHelper(config, pg_2, pn_1, pm_2, exp_2_1_2);
13478 BrkpbHelper(config, pg_2, pn_2, pm_3, exp_2_2_3);
13479 BrkpbHelper(config, pg_2, pn_3, pm_1, exp_2_3_1);
13480
13481 // | first active
13482 // v
13483 int exp_3_1_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
13484 // | first active
13485 // v
13486 int exp_3_2_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
13487 // | first active
13488 // v
13489 int exp_3_3_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
13490 BrkpbHelper(config, pg_3, pn_1, pm_3, exp_3_1_3);
13491 BrkpbHelper(config, pg_3, pn_2, pm_1, exp_3_2_1);
13492 BrkpbHelper(config, pg_3, pn_3, pm_2, exp_3_3_2);
13493
13494 // The last active element of each `pn` input is `false` (under `pg_4`) in all vector length configurations.
13495 // | last active lane when VL > 128 bits.
13496 // v
13497 // | last active lane when VL == 128 bits.
13498 // v
13499 int pg_4[] = {0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
13500 int exp_4_x_x[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
13501 BrkpbHelper(config, pg_4, pn_1, pm_1, exp_4_x_x);
13502 BrkpbHelper(config, pg_4, pn_2, pm_2, exp_4_x_x);
13503 BrkpbHelper(config, pg_4, pn_3, pm_3, exp_4_x_x);
13504 // clang-format on
13505}
13506
13507TEST_SVE(sve_brkpa) {
13508 // clang-format off
13509 // The last active element of each `pn` input is `true` in all vector length configurations.
13510 // | boundary of 128-bit VL.
13511 // v
13512 int pg_1[] = {1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
13513 int pg_2[] = {1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
13514 int pg_3[] = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
13515
13516 // | highest-numbered lane lowest-numbered lane |
13517 // v v
13518 int pn_1[] = {1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
13519 int pn_2[] = {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
13520 int pn_3[] = {1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1};
13521
13522 int pm_1[] = {1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
13523 int pm_2[] = {0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
13524 int pm_3[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
13525
13526 // | first active
13527 // v
13528 int exp_1_1_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0};
13529 // | first active
13530 // v
13531 int exp_1_2_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
13532 // | first active
13533 // v
13534 int exp_1_3_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0};
13535
13536 BrkpaHelper(config, pg_1, pn_1, pm_1, exp_1_1_1);
13537 BrkpaHelper(config, pg_1, pn_2, pm_2, exp_1_2_2);
13538 BrkpaHelper(config, pg_1, pn_3, pm_3, exp_1_3_3);
13539
13540 // | first active
13541 // v
13542 int exp_2_1_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
13543 // | first active
13544 // v
13545 int exp_2_2_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
13546 // | first active
13547 // v
13548 int exp_2_3_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1};
13549 BrkpaHelper(config, pg_2, pn_1, pm_2, exp_2_1_2);
13550 BrkpaHelper(config, pg_2, pn_2, pm_3, exp_2_2_3);
13551 BrkpaHelper(config, pg_2, pn_3, pm_1, exp_2_3_1);
13552
13553 // | first active
13554 // v
13555 int exp_3_1_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1};
13556 // | first active
13557 // v
13558 int exp_3_2_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1};
13559 // | first active
13560 // v
13561 int exp_3_3_2[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
13562 BrkpaHelper(config, pg_3, pn_1, pm_3, exp_3_1_3);
13563 BrkpaHelper(config, pg_3, pn_2, pm_1, exp_3_2_1);
13564 BrkpaHelper(config, pg_3, pn_3, pm_2, exp_3_3_2);
13565
13566 // The last active element of each `pn` input is `false` (under `pg_4`) in all vector length configurations.
13567 // | last active lane when VL > 128 bits.
13568 // v
13569 // | last active lane when VL == 128 bits.
13570 // v
13571 int pg_4[] = {0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
13572 int exp_4_x_x[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
13573 BrkpaHelper(config, pg_4, pn_1, pm_1, exp_4_x_x);
13574 BrkpaHelper(config, pg_4, pn_2, pm_2, exp_4_x_x);
13575 BrkpaHelper(config, pg_4, pn_3, pm_3, exp_4_x_x);
13576 // clang-format on
13577}
13578
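// Rbit reverses the bit order within each active element; applying it twice
// should restore the original value (checked on z0 below).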
Martyn Capewell77b6d982019-12-02 18:34:59 +000013579TEST_SVE(sve_rbit) {
13580 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13581 START();
13582
13583 uint64_t inputs[] = {0xaaaaaaaa55555555, 0xaaaa5555aa55aa55};
13584 InsrHelper(&masm, z0.VnD(), inputs);
13585
13586 __ Ptrue(p1.VnB());
13587 int pred[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1};
13588 Initialise(&masm, p2.VnB(), pred);
13589
13590 __ Rbit(z0.VnB(), p1.Merging(), z0.VnB());
13591 __ Rbit(z0.VnB(), p1.Merging(), z0.VnB());
13592
13593 __ Rbit(z1.VnB(), p1.Merging(), z0.VnB());
13594 __ Rbit(z2.VnH(), p1.Merging(), z0.VnH());
13595 __ Rbit(z3.VnS(), p1.Merging(), z0.VnS());
13596 __ Rbit(z4.VnD(), p1.Merging(), z0.VnD());
13597
13598 __ Dup(z5.VnB(), 0x42);
13599 __ Rbit(z5.VnB(), p2.Merging(), z0.VnB());
13600 __ Dup(z6.VnB(), 0x42);
13601 __ Rbit(z6.VnS(), p2.Merging(), z0.VnS());
13602
13603 END();
13604
13605 if (CAN_RUN()) {
13606 RUN();
13607
13608 ASSERT_EQUAL_SVE(inputs, z0.VnD());
13609
13610 uint64_t expected_z1[] = {0x55555555aaaaaaaa, 0x5555aaaa55aa55aa};
13611 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
13612 uint64_t expected_z2[] = {0x55555555aaaaaaaa, 0x5555aaaaaa55aa55};
13613 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
13614 uint64_t expected_z3[] = {0x55555555aaaaaaaa, 0xaaaa5555aa55aa55};
13615 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
13616 uint64_t expected_z4[] = {0xaaaaaaaa55555555, 0xaa55aa55aaaa5555};
13617 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
13618 uint64_t expected_z5[] = {0x4255425542aa42aa, 0x4255424242aa42aa};
13619 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
13620 uint64_t expected_z6[] = {0x55555555aaaaaaaa, 0x42424242aa55aa55};
13621 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
13622 }
13623}
13624
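// Revb, Revh and Revw reverse the order of bytes, halfwords and words
// (respectively) within each element of the destination lane size, under a
// merging predicate.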
13625TEST_SVE(sve_rev_bhw) {
13626 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13627 START();
13628
13629 uint64_t inputs[] = {0xaaaaaaaa55555555, 0xaaaa5555aa55aa55};
13630 InsrHelper(&masm, z0.VnD(), inputs);
13631
13632 __ Ptrue(p1.VnB());
13633 int pred[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1};
13634 Initialise(&masm, p2.VnB(), pred);
13635
13636 __ Revb(z1.VnH(), p1.Merging(), z0.VnH());
13637 __ Revb(z2.VnS(), p1.Merging(), z0.VnS());
13638 __ Revb(z3.VnD(), p1.Merging(), z0.VnD());
13639 __ Revh(z4.VnS(), p1.Merging(), z0.VnS());
13640 __ Revh(z5.VnD(), p1.Merging(), z0.VnD());
13641 __ Revw(z6.VnD(), p1.Merging(), z0.VnD());
13642
13643 __ Dup(z7.VnB(), 0x42);
13644 __ Revb(z7.VnH(), p2.Merging(), z0.VnH());
13645 __ Dup(z8.VnB(), 0x42);
13646 __ Revh(z8.VnS(), p2.Merging(), z0.VnS());
13647
13648 END();
13649
13650 if (CAN_RUN()) {
13651 RUN();
13652
13653 uint64_t expected_z1[] = {0xaaaaaaaa55555555, 0xaaaa555555aa55aa};
13654 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
13655 uint64_t expected_z2[] = {0xaaaaaaaa55555555, 0x5555aaaa55aa55aa};
13656 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
13657 uint64_t expected_z3[] = {0x55555555aaaaaaaa, 0x55aa55aa5555aaaa};
13658 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
13659 uint64_t expected_z4[] = {0xaaaaaaaa55555555, 0x5555aaaaaa55aa55};
13660 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
13661 uint64_t expected_z5[] = {0x55555555aaaaaaaa, 0xaa55aa555555aaaa};
13662 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
13663 uint64_t expected_z6[] = {0x55555555aaaaaaaa, 0xaa55aa55aaaa5555};
13664 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
13665 uint64_t expected_z7[] = {0xaaaaaaaa55555555, 0xaaaa424255aa55aa};
13666 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
13667 uint64_t expected_z8[] = {0xaaaaaaaa55555555, 0x42424242aa55aa55};
13668 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
13669 }
13670}
13671
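// Ftssel (trigonometric select coefficient): bit 0 of each element of the
// second operand selects between the first operand and a constant 1.0, and
// bit 1 inverts the sign of the result.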
Martyn Capewell43782632019-12-12 13:22:10 +000013672TEST_SVE(sve_ftssel) {
13673 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13674 START();
13675
13676 uint64_t in[] = {0x1111777766665555, 0xaaaabbbbccccdddd};
13677 uint64_t q[] = {0x0001000300000002, 0x0001000200000003};
13678 InsrHelper(&masm, z0.VnD(), in);
13679 InsrHelper(&masm, z1.VnD(), q);
13680
13681 __ Ftssel(z2.VnH(), z0.VnH(), z1.VnH());
13682 __ Ftssel(z3.VnS(), z0.VnS(), z1.VnS());
13683 __ Ftssel(z4.VnD(), z0.VnD(), z1.VnD());
13684
13685 END();
13686
13687 if (CAN_RUN()) {
13688 RUN();
13689
13690 uint64_t expected_z2[] = {0x3c00bc006666d555, 0x3c003bbbccccbc00};
13691 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
13692 uint64_t expected_z3[] = {0xbf800000e6665555, 0x2aaabbbbbf800000};
13693 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
13694 uint64_t expected_z4[] = {0x9111777766665555, 0xbff0000000000000};
13695 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
13696 }
13697}
13698
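// Fexpa (exponential accelerator) constructs a floating-point value whose
// fraction is looked up from a fixed table indexed by the low bits of each
// element, and whose exponent comes from the bits above them; the exact bit
// split depends on the element size.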
13699TEST_SVE(sve_fexpa) {
13700 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13701 START();
13702
13703 uint64_t in0[] = {0x3ff0000000000000, 0x3ff0000000011001};
13704 uint64_t in1[] = {0x3ff000000002200f, 0xbff000000003301f};
13705 uint64_t in2[] = {0xbff000000004403f, 0x3ff0000000055040};
13706 uint64_t in3[] = {0x3f800000bf800001, 0x3f80000f3f80001f};
13707 uint64_t in4[] = {0x3f80002f3f82203f, 0xbf8000403f833041};
13708 uint64_t in5[] = {0x3c003c01bc00bc07, 0x3c08bc0f3c1fbc20};
13709 InsrHelper(&masm, z0.VnD(), in0);
13710 InsrHelper(&masm, z1.VnD(), in1);
13711 InsrHelper(&masm, z2.VnD(), in2);
13712 InsrHelper(&masm, z3.VnD(), in3);
13713 InsrHelper(&masm, z4.VnD(), in4);
13714 InsrHelper(&masm, z5.VnD(), in5);
13715
13716 __ Fexpa(z6.VnD(), z0.VnD());
13717 __ Fexpa(z7.VnD(), z1.VnD());
13718 __ Fexpa(z8.VnD(), z2.VnD());
13719 __ Fexpa(z9.VnS(), z3.VnS());
13720 __ Fexpa(z10.VnS(), z4.VnS());
13721 __ Fexpa(z11.VnH(), z5.VnH());
13722
13723 END();
13724
13725 if (CAN_RUN()) {
13726 RUN();
13727 uint64_t expected_z6[] = {0x0000000000000000, 0x44002c9a3e778061};
13728 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
13729 uint64_t expected_z7[] = {0x0802d285a6e4030b, 0x4c06623882552225};
13730 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
13731 uint64_t expected_z8[] = {0x100fa7c1819e90d8, 0x5410000000000000};
13732 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
13733 uint64_t expected_z9[] = {0x00000000000164d2, 0x0016942d003311c4};
13734 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
13735 uint64_t expected_z10[] = {0x0054f35b407d3e0c, 0x00800000608164d2};
13736 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
13737 uint64_t expected_z11[] = {0x00000016000000a8, 0x00c2018903d40400};
13738 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
13739 }
13740}
13741
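// Rev on a predicate reverses the order of the lanes at the given lane size.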
Martyn Capewell7fd6fd52019-12-06 14:50:15 +000013742TEST_SVE(sve_rev_p) {
13743 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13744 START();
13745
13746 Initialise(&masm,
13747 p0.VnB(),
13748 0xabcdabcdabcdabcd,
13749 0xabcdabcdabcdabcd,
13750 0xabcdabcdabcdabcd,
13751 0xabcdabcdabcdabcd);
13752
13753 __ Rev(p1.VnB(), p0.VnB());
13754 __ Rev(p2.VnH(), p0.VnH());
13755 __ Rev(p3.VnS(), p0.VnS());
13756 __ Rev(p4.VnD(), p0.VnD());
13757
13758 END();
13759
13760 if (CAN_RUN()) {
13761 RUN();
13762
13763 int p1_expected[] = {1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1};
13764 ASSERT_EQUAL_SVE(p1_expected, p1.VnB());
13765 int p2_expected[] = {0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0};
13766 ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
13767 int p3_expected[] = {1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0};
13768 ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
13769 int p4_expected[] = {1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1};
13770 ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
13771 }
13772}
13773
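// The predicate forms of Trn1/Trn2 interleave the even-numbered (Trn1) or
// odd-numbered (Trn2) lanes of the two source predicates at the given lane
// size.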
13774TEST_SVE(sve_trn_p_bh) {
13775 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13776 START();
13777
13778 Initialise(&masm, p0.VnB(), 0xa5a55a5a);
13779 __ Pfalse(p1.VnB());
13780
13781 __ Trn1(p2.VnB(), p0.VnB(), p0.VnB());
13782 __ Trn2(p3.VnB(), p0.VnB(), p0.VnB());
13783 __ Trn1(p4.VnB(), p1.VnB(), p0.VnB());
13784 __ Trn2(p5.VnB(), p1.VnB(), p0.VnB());
13785 __ Trn1(p6.VnB(), p0.VnB(), p1.VnB());
13786 __ Trn2(p7.VnB(), p0.VnB(), p1.VnB());
13787
13788 __ Trn1(p8.VnH(), p0.VnH(), p0.VnH());
13789 __ Trn2(p9.VnH(), p0.VnH(), p0.VnH());
13790 __ Trn1(p10.VnH(), p1.VnH(), p0.VnH());
13791 __ Trn2(p11.VnH(), p1.VnH(), p0.VnH());
13792 __ Trn1(p12.VnH(), p0.VnH(), p1.VnH());
13793 __ Trn2(p13.VnH(), p0.VnH(), p1.VnH());
13794
13795 END();
13796
13797 if (CAN_RUN()) {
13798 RUN();
13799 int p2_expected[] = {1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0};
13800 int p3_expected[] = {0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1};
13801 ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
13802 ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
13803
13804 int p4_expected[] = {1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0};
13805 int p5_expected[] = {0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0};
13806 ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
13807 ASSERT_EQUAL_SVE(p5_expected, p5.VnB());
13808
13809 int p6_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0};
13810 int p7_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1};
13811 ASSERT_EQUAL_SVE(p6_expected, p6.VnB());
13812 ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
13813
13814 int p8_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
13815 int p9_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
13816 ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
13817 ASSERT_EQUAL_SVE(p9_expected, p9.VnB());
13818
13819 int p10_expected[] = {0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0};
13820 int p11_expected[] = {0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0};
13821 ASSERT_EQUAL_SVE(p10_expected, p10.VnB());
13822 ASSERT_EQUAL_SVE(p11_expected, p11.VnB());
13823
13824 int p12_expected[] = {0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0};
13825 int p13_expected[] = {0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0};
13826 ASSERT_EQUAL_SVE(p12_expected, p12.VnB());
13827 ASSERT_EQUAL_SVE(p13_expected, p13.VnB());
13828 }
13829}
13830
13831TEST_SVE(sve_trn_p_sd) {
13832 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13833 START();
13834
13835 Initialise(&masm, p0.VnB(), 0x55a55aaa);
13836 __ Pfalse(p1.VnB());
13837
13838 __ Trn1(p2.VnS(), p0.VnS(), p0.VnS());
13839 __ Trn2(p3.VnS(), p0.VnS(), p0.VnS());
13840 __ Trn1(p4.VnS(), p1.VnS(), p0.VnS());
13841 __ Trn2(p5.VnS(), p1.VnS(), p0.VnS());
13842 __ Trn1(p6.VnS(), p0.VnS(), p1.VnS());
13843 __ Trn2(p7.VnS(), p0.VnS(), p1.VnS());
13844
13845 __ Trn1(p8.VnD(), p0.VnD(), p0.VnD());
13846 __ Trn2(p9.VnD(), p0.VnD(), p0.VnD());
13847 __ Trn1(p10.VnD(), p1.VnD(), p0.VnD());
13848 __ Trn2(p11.VnD(), p1.VnD(), p0.VnD());
13849 __ Trn1(p12.VnD(), p0.VnD(), p1.VnD());
13850 __ Trn2(p13.VnD(), p0.VnD(), p1.VnD());
13851
13852 END();
13853
13854 if (CAN_RUN()) {
13855 RUN();
13856 int p2_expected[] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0};
13857 int p3_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0};
13858 ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
13859 ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
13860
13861 int p4_expected[] = {1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0};
13862 int p5_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0};
13863 ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
13864 ASSERT_EQUAL_SVE(p5_expected, p5.VnB());
13865
13866 int p6_expected[] = {0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0};
13867 int p7_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0};
13868 ASSERT_EQUAL_SVE(p6_expected, p6.VnB());
13869 ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
13870
13871 int p8_expected[] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0};
13872 int p9_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
13873 ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
13874 ASSERT_EQUAL_SVE(p9_expected, p9.VnB());
13875
13876 int p10_expected[] = {1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
13877 int p11_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
13878 ASSERT_EQUAL_SVE(p10_expected, p10.VnB());
13879 ASSERT_EQUAL_SVE(p11_expected, p11.VnB());
13880
13881 int p12_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0};
13882 int p13_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0};
13883 ASSERT_EQUAL_SVE(p12_expected, p12.VnB());
13884 ASSERT_EQUAL_SVE(p13_expected, p13.VnB());
13885 }
13886}
13887
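// The predicate forms of Zip1/Zip2 interleave lanes from the low (Zip1) or
// high (Zip2) halves of the two source predicates.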
13888TEST_SVE(sve_zip_p_bh) {
13889 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13890 START();
13891
13892 Initialise(&masm,
13893 p0.VnB(),
13894 0x5a5a5a5a5a5a5a5a,
13895 0x5a5a5a5a5a5a5a5a,
13896 0x5a5a5a5a5a5a5a5a,
13897 0x5a5a5a5a5a5a5a5a);
13898 __ Pfalse(p1.VnB());
13899
13900 __ Zip1(p2.VnB(), p0.VnB(), p0.VnB());
13901 __ Zip2(p3.VnB(), p0.VnB(), p0.VnB());
13902 __ Zip1(p4.VnB(), p1.VnB(), p0.VnB());
13903 __ Zip2(p5.VnB(), p1.VnB(), p0.VnB());
13904 __ Zip1(p6.VnB(), p0.VnB(), p1.VnB());
13905 __ Zip2(p7.VnB(), p0.VnB(), p1.VnB());
13906
13907 __ Zip1(p8.VnH(), p0.VnH(), p0.VnH());
13908 __ Zip2(p9.VnH(), p0.VnH(), p0.VnH());
13909 __ Zip1(p10.VnH(), p1.VnH(), p0.VnH());
13910 __ Zip2(p11.VnH(), p1.VnH(), p0.VnH());
13911 __ Zip1(p12.VnH(), p0.VnH(), p1.VnH());
13912 __ Zip2(p13.VnH(), p0.VnH(), p1.VnH());
13913
13914 END();
13915
13916 if (CAN_RUN()) {
13917 RUN();
13918 int p2_expected[] = {0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0};
13919 int p3_expected[] = {0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0};
13920 ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
13921 ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
13922
13923 int p4_expected[] = {0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0};
13924 int p5_expected[] = {0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0};
13925 ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
13926 ASSERT_EQUAL_SVE(p5_expected, p5.VnB());
13927
13928 int p6_expected[] = {0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0};
13929 int p7_expected[] = {0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0};
13930 ASSERT_EQUAL_SVE(p6_expected, p6.VnB());
13931 ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
13932
13933 int p8_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0};
13934 int p9_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0};
13935 ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
13936 ASSERT_EQUAL_SVE(p9_expected, p9.VnB());
13937
13938 int p10_expected[] = {0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
13939 int p11_expected[] = {0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
13940 ASSERT_EQUAL_SVE(p10_expected, p10.VnB());
13941 ASSERT_EQUAL_SVE(p11_expected, p11.VnB());
13942
13943 int p12_expected[] = {0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0};
13944 int p13_expected[] = {0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0};
13945 ASSERT_EQUAL_SVE(p12_expected, p12.VnB());
13946 ASSERT_EQUAL_SVE(p13_expected, p13.VnB());
13947 }
13948}
13949
13950TEST_SVE(sve_zip_p_sd) {
13951 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13952 START();
13953
13954 Initialise(&masm,
13955 p0.VnB(),
13956 0x5a5a5a5a5a5a5a5a,
13957 0x5a5a5a5a5a5a5a5a,
13958 0x5a5a5a5a5a5a5a5a,
13959 0x5a5a5a5a5a5a5a5a);
13960 __ Pfalse(p1.VnB());
13961
13962 __ Zip1(p2.VnS(), p0.VnS(), p0.VnS());
13963 __ Zip2(p3.VnS(), p0.VnS(), p0.VnS());
13964 __ Zip1(p4.VnS(), p1.VnS(), p0.VnS());
13965 __ Zip2(p5.VnS(), p1.VnS(), p0.VnS());
13966 __ Zip1(p6.VnS(), p0.VnS(), p1.VnS());
13967 __ Zip2(p7.VnS(), p0.VnS(), p1.VnS());
13968
13969 __ Zip1(p8.VnD(), p0.VnD(), p0.VnD());
13970 __ Zip2(p9.VnD(), p0.VnD(), p0.VnD());
13971 __ Zip1(p10.VnD(), p1.VnD(), p0.VnD());
13972 __ Zip2(p11.VnD(), p1.VnD(), p0.VnD());
13973 __ Zip1(p12.VnD(), p0.VnD(), p1.VnD());
13974 __ Zip2(p13.VnD(), p0.VnD(), p1.VnD());
13975
13976 END();
13977
13978 if (CAN_RUN()) {
13979 RUN();
13980 int p2_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0};
13981 int p3_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0};
13982 ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
13983 ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
13984
13985 int p4_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0};
13986 int p5_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0};
13987 ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
13988 ASSERT_EQUAL_SVE(p5_expected, p5.VnB());
13989
13990 int p6_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0};
13991 int p7_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0};
13992 ASSERT_EQUAL_SVE(p6_expected, p6.VnB());
13993 ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
13994
13995 int p8_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
13996 int p9_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
13997 ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
13998 ASSERT_EQUAL_SVE(p9_expected, p9.VnB());
13999
14000 int p10_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14001 int p11_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14002 ASSERT_EQUAL_SVE(p10_expected, p10.VnB());
14003 ASSERT_EQUAL_SVE(p11_expected, p11.VnB());
14004
14005 int p12_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0};
14006 int p13_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0};
14007 ASSERT_EQUAL_SVE(p12_expected, p12.VnB());
14008 ASSERT_EQUAL_SVE(p13_expected, p13.VnB());
14009 }
14010}
14011
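// Uzp1/Uzp2 extract the even/odd lanes of the two source predicates; here they
// are checked as the inverse of the corresponding Zip1/Zip2 operations.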
14012TEST_SVE(sve_uzp_p) {
14013 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14014 START();
14015
14016 Initialise(&masm,
14017 p0.VnB(),
14018 0xf0f0ff00ffff0000,
14019 0x4242424242424242,
14020 0x5a5a5a5a5a5a5a5a,
14021 0x0123456789abcdef);
14022 __ Rev(p1.VnB(), p0.VnB());
14023
14024 __ Zip1(p2.VnB(), p0.VnB(), p1.VnB());
14025 __ Zip2(p3.VnB(), p0.VnB(), p1.VnB());
14026 __ Uzp1(p4.VnB(), p2.VnB(), p3.VnB());
14027 __ Uzp2(p5.VnB(), p2.VnB(), p3.VnB());
14028
14029 __ Zip1(p2.VnH(), p0.VnH(), p1.VnH());
14030 __ Zip2(p3.VnH(), p0.VnH(), p1.VnH());
14031 __ Uzp1(p6.VnH(), p2.VnH(), p3.VnH());
14032 __ Uzp2(p7.VnH(), p2.VnH(), p3.VnH());
14033
14034 __ Zip1(p2.VnS(), p0.VnS(), p1.VnS());
14035 __ Zip2(p3.VnS(), p0.VnS(), p1.VnS());
14036 __ Uzp1(p8.VnS(), p2.VnS(), p3.VnS());
14037 __ Uzp2(p9.VnS(), p2.VnS(), p3.VnS());
14038
14039 __ Zip1(p2.VnD(), p0.VnD(), p1.VnD());
14040 __ Zip2(p3.VnD(), p0.VnD(), p1.VnD());
14041 __ Uzp1(p10.VnD(), p2.VnD(), p3.VnD());
14042 __ Uzp2(p11.VnD(), p2.VnD(), p3.VnD());
14043
14044 END();
14045
14046 if (CAN_RUN()) {
14047 RUN();
14048
14049 ASSERT_EQUAL_SVE(p0, p4);
14050 ASSERT_EQUAL_SVE(p1, p5);
14051 ASSERT_EQUAL_SVE(p0, p6);
14052 ASSERT_EQUAL_SVE(p1, p7);
14053 ASSERT_EQUAL_SVE(p0, p8);
14054 ASSERT_EQUAL_SVE(p1, p9);
14055 ASSERT_EQUAL_SVE(p0, p10);
14056 ASSERT_EQUAL_SVE(p1, p11);
14057 }
14058}
14059
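// Punpklo/Punpkhi widen the low/high half of a byte-granular predicate so that
// each source lane maps to a halfword-sized lane of the result, leaving the
// odd-numbered byte lanes clear.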
14060TEST_SVE(sve_punpk) {
14061 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14062 START();
14063
Jacob Bramley3980b742020-07-01 12:25:54 +010014064 auto get_64_bits_at = [](int byte_index) -> uint64_t {
14065 // Each 8-bit chunk has the value 0x50 + the byte index of the chunk.
14066 return 0x5756555453525150 + (0x0101010101010101 * byte_index);
14067 };
14068
Martyn Capewell7fd6fd52019-12-06 14:50:15 +000014069 Initialise(&masm,
14070 p0.VnB(),
Jacob Bramley3980b742020-07-01 12:25:54 +010014071 get_64_bits_at(24),
14072 get_64_bits_at(16),
14073 get_64_bits_at(8),
14074 get_64_bits_at(0));
Martyn Capewell7fd6fd52019-12-06 14:50:15 +000014075 __ Punpklo(p1.VnH(), p0.VnB());
14076 __ Punpkhi(p2.VnH(), p0.VnB());
14077
14078 END();
14079
14080 if (CAN_RUN()) {
14081 RUN();
14082
Jacob Bramley3980b742020-07-01 12:25:54 +010014083 int pl = config->sve_vl_in_bits() / kZRegBitsPerPRegBit;
14084 // For simplicity, just test the bottom 64 H-sized lanes.
14085 uint64_t p1_h_bits = get_64_bits_at(0);
14086 uint64_t p2_h_bits = get_64_bits_at(pl / (2 * 8));
14087 int p1_expected[64];
14088 int p2_expected[64];
14089 for (size_t i = 0; i < 64; i++) {
14090 p1_expected[63 - i] = (p1_h_bits >> i) & 1;
14091 p2_expected[63 - i] = (p2_h_bits >> i) & 1;
14092 }
14093 // Testing `VnH` ensures that odd-numbered B lanes are zero.
14094 ASSERT_EQUAL_SVE(p1_expected, p1.VnH());
14095 ASSERT_EQUAL_SVE(p2_expected, p2.VnH());
Martyn Capewell7fd6fd52019-12-06 14:50:15 +000014096 }
14097}
14098
TatWai Chong5d872292020-01-02 15:39:51 -080014099typedef void (MacroAssembler::*BrkFn)(const PRegisterWithLaneSize& pd,
14100 const PRegister& pg,
14101 const PRegisterWithLaneSize& pn);
14102
14103typedef void (MacroAssembler::*BrksFn)(const PRegisterWithLaneSize& pd,
14104 const PRegisterZ& pg,
14105 const PRegisterWithLaneSize& pn);
14106
14107template <typename T, size_t N>
14108static void BrkaBrkbHelper(Test* config,
14109 BrkFn macro,
14110 BrksFn macro_set_flags,
14111 const T (&pd_inputs)[N],
14112 const T (&pg_inputs)[N],
14113 const T (&pn_inputs)[N],
14114 const T (&pd_z_expected)[N]) {
14115 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14116 START();
14117
14118 PRegister pg = p10;
14119 PRegister pn = p9;
14120 PRegister pd_z = p0;
14121 PRegister pd_z_s = p1;
14122 PRegister pd_m = p2;
14123 Initialise(&masm, pg.VnB(), pg_inputs);
14124 Initialise(&masm, pn.VnB(), pn_inputs);
14125 Initialise(&masm, pd_m.VnB(), pd_inputs);
14126
14127 // Initialise NZCV to an impossible value, to check that we actually write it.
14128 __ Mov(x10, NZCVFlag);
14129 __ Msr(NZCV, x10);
14130
14131 (masm.*macro)(pd_z.VnB(), pg.Zeroing(), pn.VnB());
14132 (masm.*macro_set_flags)(pd_z_s.VnB(), pg.Zeroing(), pn.VnB());
14133 __ Mrs(x0, NZCV);
14134
14135 (masm.*macro)(pd_m.VnB(), pg.Merging(), pn.VnB());
14136
14137 END();
14138
14139 if (CAN_RUN()) {
14140 RUN();
14141
14142 ASSERT_EQUAL_SVE(pd_z_expected, pd_z.VnB());
14143
14144 // Check that the flags were properly set.
14145 StatusFlags nzcv_expected =
14146 GetPredTestFlags(pd_z_expected,
14147 pg_inputs,
14148 core.GetSVELaneCount(kBRegSize));
14149 ASSERT_EQUAL_64(nzcv_expected, x0);
14150 ASSERT_EQUAL_SVE(pd_z.VnB(), pd_z_s.VnB());
14151
14152 T pd_m_expected[N];
14153 // Set expected `pd` result on merging predication.
14154 // Compute the expected `pd` result for merging predication.
14155 pd_m_expected[i] = pg_inputs[i] ? pd_z_expected[i] : pd_inputs[i];
14156 }
14157 ASSERT_EQUAL_SVE(pd_m_expected, pd_m.VnB());
14158 }
14159}
14160
14161template <typename T>
14162static void BrkaHelper(Test* config,
14163 const T& pd_inputs,
14164 const T& pg_inputs,
14165 const T& pn_inputs,
14166 const T& pd_expected) {
14167 BrkaBrkbHelper(config,
14168 &MacroAssembler::Brka,
14169 &MacroAssembler::Brkas,
14170 pd_inputs,
14171 pg_inputs,
14172 pn_inputs,
14173 pd_expected);
14174}
14175
14176TEST_SVE(sve_brka) {
14177 // clang-format off
14178 // | boundary of 128-bit VL.
14179 // v
14180 int pd[] = {1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14181
14182 // | highest-numbered lane lowest-numbered lane |
14183 // v v
14184 int pg_1[] = {1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
14185 int pg_2[] = {1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
14186
14187 int pn_1[] = {1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
14188 int pn_2[] = {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14189 int pn_3[] = {1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1};
14190
14191 // | first break
14192 // v
14193 int exp_1_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0};
14194 // | first break
14195 // v
14196 int exp_1_2[] = {0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
14197 // | first break
14198 // v
14199 int exp_1_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
14200
14201 BrkaHelper(config, pd, pg_1, pn_1, exp_1_1);
14202 BrkaHelper(config, pd, pg_1, pn_2, exp_1_2);
14203 BrkaHelper(config, pd, pg_1, pn_3, exp_1_3);
14204
14205 // | first break
14206 // v
14207 int exp_2_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1};
14208 // | first break
14209 // v
14210 int exp_2_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
14211 // | first break
14212 // v
14213 int exp_2_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
14214 BrkaHelper(config, pd, pg_2, pn_1, exp_2_1);
14215 BrkaHelper(config, pd, pg_2, pn_2, exp_2_2);
14216 BrkaHelper(config, pd, pg_2, pn_3, exp_2_3);
14217
14218 // An all-inactive governing predicate with zeroing sets the destination predicate to all-false.
14219 int pg_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14220 int exp_3_x[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14221 BrkaHelper(config, pd, pg_3, pn_1, exp_3_x);
14222 BrkaHelper(config, pd, pg_3, pn_2, exp_3_x);
14223 BrkaHelper(config, pd, pg_3, pn_3, exp_3_x);
14224 // clang-format on
14225}
14226
14227template <typename T>
14228static void BrkbHelper(Test* config,
14229 const T& pd_inputs,
14230 const T& pg_inputs,
14231 const T& pn_inputs,
14232 const T& pd_expected) {
14233 BrkaBrkbHelper(config,
14234 &MacroAssembler::Brkb,
14235 &MacroAssembler::Brkbs,
14236 pd_inputs,
14237 pg_inputs,
14238 pn_inputs,
14239 pd_expected);
14240}
14241
14242TEST_SVE(sve_brkb) {
14243 // clang-format off
14244 // | boundary of 128-bit VL.
14245 // v
14246 int pd[] = {1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14247
14248 // | highest-numbered lane lowest-numbered lane |
14249 // v v
14250 int pg_1[] = {1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
14251 int pg_2[] = {1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
14252
14253 int pn_1[] = {1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
14254 int pn_2[] = {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14255 int pn_3[] = {1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1};
14256
14257 // | first break
14258 // v
14259 int exp_1_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
14260 // | first break
14261 // v
14262 int exp_1_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
14263 // | first break
14264 // v
14265 int exp_1_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0};
14266
14267 BrkbHelper(config, pd, pg_1, pn_1, exp_1_1);
14268 BrkbHelper(config, pd, pg_1, pn_2, exp_1_2);
14269 BrkbHelper(config, pd, pg_1, pn_3, exp_1_3);
14270
14271 // | first break
14272 // v
14273 int exp_2_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1};
14274 // | first break
14275 // v
14276 int exp_2_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
14277 // | first break
14278 // v
14279 int exp_2_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14280 BrkbHelper(config, pd, pg_2, pn_1, exp_2_1);
14281 BrkbHelper(config, pd, pg_2, pn_2, exp_2_2);
14282 BrkbHelper(config, pd, pg_2, pn_3, exp_2_3);
14283
14284 // An all-inactive governing predicate with zeroing sets the destination predicate to all-false.
14285 int pg_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14286 int exp_3_x[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14287 BrkbHelper(config, pd, pg_3, pn_1, exp_3_x);
14288 BrkbHelper(config, pd, pg_3, pn_2, exp_3_x);
14289 BrkbHelper(config, pd, pg_3, pn_3, exp_3_x);
14290 // clang-format on
14291}
14292
14293typedef void (MacroAssembler::*BrknFn)(const PRegisterWithLaneSize& pd,
14294 const PRegisterZ& pg,
14295 const PRegisterWithLaneSize& pn,
14296 const PRegisterWithLaneSize& pm);
14297
14298typedef void (MacroAssembler::*BrknsFn)(const PRegisterWithLaneSize& pd,
14299 const PRegisterZ& pg,
14300 const PRegisterWithLaneSize& pn,
14301 const PRegisterWithLaneSize& pm);
14302
14303enum BrknDstPredicateState { kAllFalse, kUnchanged };
14304
14305template <typename T, size_t N>
14306static void BrknHelper(Test* config,
TatWai Chong5d872292020-01-02 15:39:51 -080014307 const T (&pd_inputs)[N],
14308 const T (&pg_inputs)[N],
14309 const T (&pn_inputs)[N],
14310 const T (&pm_inputs)[N],
14311 BrknDstPredicateState expected_pd_state) {
14312 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14313 START();
14314
14315 PRegister pg = p10;
14316 PRegister pn = p9;
14317 PRegister pm = p8;
14318 PRegister pdm = p0;
14319 PRegister pd = p1;
14320 PRegister pd_s = p2;
14321 Initialise(&masm, pg.VnB(), pg_inputs);
14322 Initialise(&masm, pn.VnB(), pn_inputs);
14323 Initialise(&masm, pm.VnB(), pm_inputs);
14324 Initialise(&masm, pdm.VnB(), pm_inputs);
14325 Initialise(&masm, pd.VnB(), pd_inputs);
14326 Initialise(&masm, pd_s.VnB(), pd_inputs);
14327
14328 // Initialise NZCV to an impossible value, to check that we actually write it.
14329 __ Mov(x10, NZCVFlag);
14330 __ Msr(NZCV, x10);
14331
Jacob Bramleya3d61102020-07-01 16:49:47 +010014332 __ Brkn(pdm.VnB(), pg.Zeroing(), pn.VnB(), pdm.VnB());
TatWai Chong5d872292020-01-02 15:39:51 -080014333 // !pd.Aliases(pm).
Jacob Bramleya3d61102020-07-01 16:49:47 +010014334 __ Brkn(pd.VnB(), pg.Zeroing(), pn.VnB(), pm.VnB());
14335 __ Brkns(pd_s.VnB(), pg.Zeroing(), pn.VnB(), pm.VnB());
TatWai Chong5d872292020-01-02 15:39:51 -080014336 __ Mrs(x0, NZCV);
14337
14338 END();
14339
14340 if (CAN_RUN()) {
14341 RUN();
14342
14343 T all_false[N] = {0};
14344 if (expected_pd_state == kAllFalse) {
14345 ASSERT_EQUAL_SVE(all_false, pd.VnB());
14346 } else {
14347 ASSERT_EQUAL_SVE(pm_inputs, pd.VnB());
14348 }
14349 ASSERT_EQUAL_SVE(pm_inputs, pm.VnB());
14350
Jacob Bramleya3d61102020-07-01 16:49:47 +010014351 T all_true[N];
14352 for (size_t i = 0; i < ArrayLength(all_true); i++) {
14353 all_true[i] = 1;
14354 }
14355
TatWai Chong5d872292020-01-02 15:39:51 -080014356 // Check that the flags were properly set.
14357 StatusFlags nzcv_expected =
14358 GetPredTestFlags((expected_pd_state == kAllFalse) ? all_false
14359 : pm_inputs,
Jacob Bramleya3d61102020-07-01 16:49:47 +010014360 all_true,
TatWai Chong5d872292020-01-02 15:39:51 -080014361 core.GetSVELaneCount(kBRegSize));
14362 ASSERT_EQUAL_64(nzcv_expected, x0);
14363 ASSERT_EQUAL_SVE(pd.VnB(), pdm.VnB());
14364 ASSERT_EQUAL_SVE(pd.VnB(), pd_s.VnB());
14365 }
14366}
14367
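// Brkn propagates a break to the next partition: if the last active lane of
// `pn` is true the destination keeps the value of `pm`, otherwise it becomes
// all-false.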
14368TEST_SVE(sve_brkn) {
Jacob Bramleya3d61102020-07-01 16:49:47 +010014369 int pd[] = {1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
14370 int pm[] = {0, 1, 1, 1, 1, 0, 0, 1, 0, 1};
TatWai Chong5d872292020-01-02 15:39:51 -080014371
Jacob Bramleya3d61102020-07-01 16:49:47 +010014372 int pg_1[] = {1, 1, 0, 0, 1, 0, 1, 1, 0, 0};
14373 int pg_2[] = {0, 0, 0, 1, 1, 1, 0, 0, 1, 1};
14374 int pg_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
TatWai Chong5d872292020-01-02 15:39:51 -080014375
Jacob Bramleya3d61102020-07-01 16:49:47 +010014376 int pn_1[] = {1, 0, 0, 0, 0, 1, 1, 0, 0, 0};
14377 int pn_2[] = {0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
14378 int pn_3[] = {0, 0, 0, 0, 1, 1, 0, 0, 1, 1};
TatWai Chong5d872292020-01-02 15:39:51 -080014379
Jacob Bramleya3d61102020-07-01 16:49:47 +010014380 BrknHelper(config, pd, pg_1, pn_1, pm, kUnchanged);
14381 BrknHelper(config, pd, pg_1, pn_2, pm, kAllFalse);
14382 BrknHelper(config, pd, pg_1, pn_3, pm, kAllFalse);
TatWai Chong5d872292020-01-02 15:39:51 -080014383
Jacob Bramleya3d61102020-07-01 16:49:47 +010014384 BrknHelper(config, pd, pg_2, pn_1, pm, kAllFalse);
14385 BrknHelper(config, pd, pg_2, pn_2, pm, kUnchanged);
14386 BrknHelper(config, pd, pg_2, pn_3, pm, kAllFalse);
TatWai Chong5d872292020-01-02 15:39:51 -080014387
Jacob Bramleya3d61102020-07-01 16:49:47 +010014388 BrknHelper(config, pd, pg_3, pn_1, pm, kAllFalse);
14389 BrknHelper(config, pd, pg_3, pn_2, pm, kAllFalse);
14390 BrknHelper(config, pd, pg_3, pn_3, pm, kAllFalse);
TatWai Chong5d872292020-01-02 15:39:51 -080014391}
14392
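// The vector forms of Trn1/Trn2 interleave the even-numbered (Trn1) or
// odd-numbered (Trn2) lanes of the two Z register sources.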
Martyn Capewell15f89012020-01-09 11:18:30 +000014393TEST_SVE(sve_trn) {
14394 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14395 START();
14396
14397 uint64_t in0[] = {0xffeeddccbbaa9988, 0x7766554433221100};
14398 uint64_t in1[] = {0xaa55aa55aa55aa55, 0x55aa55aa55aa55aa};
14399 InsrHelper(&masm, z0.VnD(), in0);
14400 InsrHelper(&masm, z1.VnD(), in1);
14401
14402 __ Trn1(z2.VnB(), z0.VnB(), z1.VnB());
14403 __ Trn2(z3.VnB(), z0.VnB(), z1.VnB());
14404 __ Trn1(z4.VnH(), z0.VnH(), z1.VnH());
14405 __ Trn2(z5.VnH(), z0.VnH(), z1.VnH());
14406 __ Trn1(z6.VnS(), z0.VnS(), z1.VnS());
14407 __ Trn2(z7.VnS(), z0.VnS(), z1.VnS());
14408 __ Trn1(z8.VnD(), z0.VnD(), z1.VnD());
14409 __ Trn2(z9.VnD(), z0.VnD(), z1.VnD());
14410
14411 END();
14412
14413 if (CAN_RUN()) {
14414 RUN();
14415 uint64_t expected_z2[] = {0x55ee55cc55aa5588, 0xaa66aa44aa22aa00};
14416 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
14417 uint64_t expected_z3[] = {0xaaffaaddaabbaa99, 0x5577555555335511};
14418 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
14419 uint64_t expected_z4[] = {0xaa55ddccaa559988, 0x55aa554455aa1100};
14420 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
14421 uint64_t expected_z5[] = {0xaa55ffeeaa55bbaa, 0x55aa776655aa3322};
14422 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
14423 uint64_t expected_z6[] = {0xaa55aa55bbaa9988, 0x55aa55aa33221100};
14424 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
14425 uint64_t expected_z7[] = {0xaa55aa55ffeeddcc, 0x55aa55aa77665544};
14426 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
14427 uint64_t expected_z8[] = {0x55aa55aa55aa55aa, 0x7766554433221100};
14428 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
14429 uint64_t expected_z9[] = {0xaa55aa55aa55aa55, 0xffeeddccbbaa9988};
14430 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
14431 }
14432}
14433
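// Zip1/Zip2 interleave lanes from the low/high halves of the two sources and
// Uzp1/Uzp2 extract the even/odd lanes; unzipping the zipped results should
// reconstruct the original vectors.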
14434TEST_SVE(sve_zip_uzp) {
14435 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14436 START();
14437
14438 __ Dup(z0.VnD(), 0xffeeddccbbaa9988);
14439 __ Insr(z0.VnD(), 0x7766554433221100);
14440 __ Dup(z1.VnD(), 0xaa55aa55aa55aa55);
14441 __ Insr(z1.VnD(), 0x55aa55aa55aa55aa);
14442
14443 __ Zip1(z2.VnB(), z0.VnB(), z1.VnB());
14444 __ Zip2(z3.VnB(), z0.VnB(), z1.VnB());
14445 __ Zip1(z4.VnH(), z0.VnH(), z1.VnH());
14446 __ Zip2(z5.VnH(), z0.VnH(), z1.VnH());
14447 __ Zip1(z6.VnS(), z0.VnS(), z1.VnS());
14448 __ Zip2(z7.VnS(), z0.VnS(), z1.VnS());
14449 __ Zip1(z8.VnD(), z0.VnD(), z1.VnD());
14450 __ Zip2(z9.VnD(), z0.VnD(), z1.VnD());
14451
14452 __ Uzp1(z10.VnB(), z2.VnB(), z3.VnB());
14453 __ Uzp2(z11.VnB(), z2.VnB(), z3.VnB());
14454 __ Uzp1(z12.VnH(), z4.VnH(), z5.VnH());
14455 __ Uzp2(z13.VnH(), z4.VnH(), z5.VnH());
14456 __ Uzp1(z14.VnS(), z6.VnS(), z7.VnS());
14457 __ Uzp2(z15.VnS(), z6.VnS(), z7.VnS());
14458 __ Uzp1(z16.VnD(), z8.VnD(), z9.VnD());
14459 __ Uzp2(z17.VnD(), z8.VnD(), z9.VnD());
14460
14461 END();
14462
14463 if (CAN_RUN()) {
14464 RUN();
14465 uint64_t expected_z2[] = {0x5577aa665555aa44, 0x5533aa225511aa00};
14466 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
14467 uint64_t expected_z3[] = {0xaaff55eeaadd55cc, 0xaabb55aaaa995588};
14468 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
14469 uint64_t expected_z4[] = {0x55aa776655aa5544, 0x55aa332255aa1100};
14470 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
14471 uint64_t expected_z5[] = {0xaa55ffeeaa55ddcc, 0xaa55bbaaaa559988};
14472 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
14473 uint64_t expected_z6[] = {0x55aa55aa77665544, 0x55aa55aa33221100};
14474 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
14475 uint64_t expected_z7[] = {0xaa55aa55ffeeddcc, 0xaa55aa55bbaa9988};
14476 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
14477 uint64_t expected_z8[] = {0x55aa55aa55aa55aa, 0x7766554433221100};
14478 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
14479 uint64_t expected_z9[] = {0xaa55aa55aa55aa55, 0xffeeddccbbaa9988};
14480 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
14481
14482 // Check uzp is the opposite of zip.
14483 ASSERT_EQUAL_SVE(z0.VnD(), z10.VnD());
14484 ASSERT_EQUAL_SVE(z1.VnD(), z11.VnD());
14485 ASSERT_EQUAL_SVE(z0.VnD(), z12.VnD());
14486 ASSERT_EQUAL_SVE(z1.VnD(), z13.VnD());
14487 ASSERT_EQUAL_SVE(z0.VnD(), z14.VnD());
14488 ASSERT_EQUAL_SVE(z1.VnD(), z15.VnD());
14489 ASSERT_EQUAL_SVE(z0.VnD(), z16.VnD());
14490 ASSERT_EQUAL_SVE(z1.VnD(), z17.VnD());
14491 }
14492}
Martyn Capewell50e9f552020-01-07 17:45:03 +000014493
Martyn Capewell0b1afa82020-03-04 11:31:42 +000014494TEST_SVE(sve_fcadd) {
14495 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14496 START();
14497
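  // Fcadd adds the second operand after rotating each of its complex (real,
  // imaginary) pairs in the complex plane: a rotation of 90 multiplies the
  // pair by i, and a rotation of 270 multiplies it by -i.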
14498 __ Dup(z30.VnS(), 0);
14499
14500 __ Ptrue(p0.VnB());
14501 __ Pfalse(p1.VnB());
14502 __ Zip1(p2.VnH(), p0.VnH(), p1.VnH()); // Real elements.
14503 __ Zip1(p3.VnH(), p1.VnH(), p0.VnH()); // Imaginary elements.
14504
14505 __ Fdup(z0.VnH(), 10.0); // 10i + 10
14506 __ Fdup(z1.VnH(), 5.0); // 5i + 5
14507 __ Index(z7.VnH(), 1, 1);
14508 __ Scvtf(z7.VnH(), p0.Merging(), z7.VnH()); // Ai + B
14509
14510 __ Sel(z2.VnH(), p3, z1.VnH(), z30.VnH()); // 5i + 0
14511 __ Sel(z3.VnH(), p2, z1.VnH(), z30.VnH()); // 0i + 5
14512 __ Sel(z7.VnH(), p3, z7.VnH(), z0.VnH()); // Ai + 10
14513 __ Ext(z8.VnB(), z7.VnB(), z7.VnB(), 2);
14514 __ Sel(z8.VnH(), p2, z8.VnH(), z30.VnH()); // 0i + A
14515
14516 // (10i + 10) + rotate(5i + 0, 90)
14517 // = (10i + 10) + (0i - 5)
14518 // = 10i + 5
14519 __ Fcadd(z4.VnH(), p0.Merging(), z0.VnH(), z2.VnH(), 90);
14520
14521 // (10i + 5) + rotate(0i + 5, 270)
14522 // = (10i + 5) + (-5i + 0)
14523 // = 5i + 5
14524 __ Fcadd(z4.VnH(), p0.Merging(), z4.VnH(), z3.VnH(), 270);
14525
14526 // The same calculation, but selecting real/imaginary using predication.
14527 __ Mov(z5, z0);
14528 __ Fcadd(z5.VnH(), p2.Merging(), z5.VnH(), z1.VnH(), 90);
14529 __ Fcadd(z5.VnH(), p3.Merging(), z5.VnH(), z1.VnH(), 270);
14530
14531 // Reference calculation: (10i + 10) - (5i + 5)
14532 __ Fsub(z6.VnH(), z0.VnH(), z1.VnH());
14533
14534 // Calculation using varying imaginary values.
14535 // (Ai + 10) + rotate(5i + 0, 90)
14536 // = (Ai + 10) + (0i - 5)
14537 // = Ai + 5
14538 __ Fcadd(z7.VnH(), p0.Merging(), z7.VnH(), z2.VnH(), 90);
14539
14540 // (Ai + 5) + rotate(0i + A, 270)
14541 // = (Ai + 5) + (-Ai + 0)
14542 // = 5
14543 __ Fcadd(z7.VnH(), p0.Merging(), z7.VnH(), z8.VnH(), 270);
14544
14545 // Repeated, but for wider elements.
14546 __ Zip1(p2.VnS(), p0.VnS(), p1.VnS());
14547 __ Zip1(p3.VnS(), p1.VnS(), p0.VnS());
14548 __ Fdup(z0.VnS(), 42.0);
14549 __ Fdup(z1.VnS(), 21.0);
14550 __ Index(z11.VnS(), 1, 1);
14551 __ Scvtf(z11.VnS(), p0.Merging(), z11.VnS());
14552 __ Sel(z2.VnS(), p3, z1.VnS(), z30.VnS());
14553 __ Sel(z29.VnS(), p2, z1.VnS(), z30.VnS());
14554 __ Sel(z11.VnS(), p3, z11.VnS(), z0.VnS());
14555 __ Ext(z12.VnB(), z11.VnB(), z11.VnB(), 4);
14556 __ Sel(z12.VnS(), p2, z12.VnS(), z30.VnS());
14557 __ Fcadd(z8.VnS(), p0.Merging(), z0.VnS(), z2.VnS(), 90);
14558 __ Fcadd(z8.VnS(), p0.Merging(), z8.VnS(), z29.VnS(), 270);
14559 __ Mov(z9, z0);
14560 __ Fcadd(z9.VnS(), p2.Merging(), z9.VnS(), z1.VnS(), 90);
14561 __ Fcadd(z9.VnS(), p3.Merging(), z9.VnS(), z1.VnS(), 270);
14562 __ Fsub(z10.VnS(), z0.VnS(), z1.VnS());
14563 __ Fcadd(z11.VnS(), p0.Merging(), z11.VnS(), z2.VnS(), 90);
14564 __ Fcadd(z11.VnS(), p0.Merging(), z11.VnS(), z12.VnS(), 270);
14565
14566 __ Zip1(p2.VnD(), p0.VnD(), p1.VnD());
14567 __ Zip1(p3.VnD(), p1.VnD(), p0.VnD());
14568 __ Fdup(z0.VnD(), -42.0);
14569 __ Fdup(z1.VnD(), -21.0);
14570 __ Index(z15.VnD(), 1, 1);
14571 __ Scvtf(z15.VnD(), p0.Merging(), z15.VnD());
14572 __ Sel(z2.VnD(), p3, z1.VnD(), z30.VnD());
14573 __ Sel(z28.VnD(), p2, z1.VnD(), z30.VnD());
14574 __ Sel(z15.VnD(), p3, z15.VnD(), z0.VnD());
14575 __ Ext(z16.VnB(), z15.VnB(), z15.VnB(), 8);
14576 __ Sel(z16.VnD(), p2, z16.VnD(), z30.VnD());
14577 __ Fcadd(z12.VnD(), p0.Merging(), z0.VnD(), z2.VnD(), 90);
14578 __ Fcadd(z12.VnD(), p0.Merging(), z12.VnD(), z28.VnD(), 270);
14579 __ Mov(z13, z0);
14580 __ Fcadd(z13.VnD(), p2.Merging(), z13.VnD(), z1.VnD(), 90);
14581 __ Fcadd(z13.VnD(), p3.Merging(), z13.VnD(), z1.VnD(), 270);
14582 __ Fsub(z14.VnD(), z0.VnD(), z1.VnD());
14583 __ Fcadd(z15.VnD(), p0.Merging(), z15.VnD(), z2.VnD(), 90);
14584 __ Fcadd(z15.VnD(), p0.Merging(), z15.VnD(), z16.VnD(), 270);
14585 END();
14586
14587 if (CAN_RUN()) {
14588 RUN();
14589 ASSERT_EQUAL_SVE(z6.VnH(), z4.VnH());
14590 ASSERT_EQUAL_SVE(z6.VnH(), z5.VnH());
14591 ASSERT_EQUAL_SVE(z3.VnH(), z7.VnH());
14592 ASSERT_EQUAL_SVE(z10.VnS(), z8.VnS());
14593 ASSERT_EQUAL_SVE(z10.VnS(), z9.VnS());
14594 ASSERT_EQUAL_SVE(z29.VnS(), z11.VnS());
14595 ASSERT_EQUAL_SVE(z14.VnD(), z12.VnD());
14596 ASSERT_EQUAL_SVE(z14.VnD(), z13.VnD());
14597 ASSERT_EQUAL_SVE(z28.VnS(), z15.VnS());
14598 }
14599}
14600
Martyn Capewelle4886e52020-03-30 09:28:52 +010014601TEST_SVE(sve_fcmla_index) {
14602 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14603 START();
14604
14605 __ Ptrue(p0.VnB());
14606
14607 __ Fdup(z0.VnH(), 10.0);
14608 __ Fdup(z2.VnH(), 2.0);
14609 __ Zip1(z0.VnH(), z0.VnH(), z2.VnH());
14610
14611 // Duplicate complex numbers across z2 segments. First segment has 1i+0,
14612 // second has 3i+2, etc.
14613 __ Index(z1.VnH(), 0, 1);
14614 __ Scvtf(z1.VnH(), p0.Merging(), z1.VnH());
14615 __ Zip1(z2.VnS(), z1.VnS(), z1.VnS());
14616 __ Zip1(z2.VnS(), z2.VnS(), z2.VnS());
14617
14618 // Derive a vector from z2 where only the third element in each segment
14619 // contains a complex number, with other elements zero.
14620 __ Index(z3.VnS(), 0, 1);
14621 __ And(z3.VnS(), z3.VnS(), 3);
14622 __ Cmpeq(p2.VnS(), p0.Zeroing(), z3.VnS(), 2);
14623 __ Dup(z3.VnB(), 0);
14624 __ Sel(z3.VnS(), p2, z2.VnS(), z3.VnS());
14625
14626 // Use indexed complex multiply on this vector, indexing the third element.
14627 __ Dup(z4.VnH(), 0);
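  // Applying Fcmla with rotations 0 and 90 accumulates the full complex
  // product of the two operands.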
14628 __ Fcmla(z4.VnH(), z0.VnH(), z3.VnH(), 2, 0);
14629 __ Fcmla(z4.VnH(), z0.VnH(), z3.VnH(), 2, 90);
14630
14631 // Rotate the indexed complex number and repeat, negated, and with a different
14632 // index.
14633 __ Ext(z3.VnH(), z3.VnH(), z3.VnH(), 4);
14634 __ Dup(z5.VnH(), 0);
14635 __ Fcmla(z5.VnH(), z0.VnH(), z3.VnH(), 1, 180);
14636 __ Fcmla(z5.VnH(), z0.VnH(), z3.VnH(), 1, 270);
14637 __ Fneg(z5.VnH(), p0.Merging(), z5.VnH());
14638
14639 // Create a reference result from a vector complex multiply.
14640 __ Dup(z6.VnH(), 0);
14641 __ Fcmla(z6.VnH(), p0.Merging(), z0.VnH(), z2.VnH(), 0);
14642 __ Fcmla(z6.VnH(), p0.Merging(), z0.VnH(), z2.VnH(), 90);
14643
14644 // Repeated, but for wider elements.
14645 __ Fdup(z0.VnS(), 42.0);
14646 __ Fdup(z2.VnS(), 24.0);
14647 __ Zip1(z0.VnS(), z0.VnS(), z2.VnS());
14648 __ Index(z1.VnS(), -42, 13);
14649 __ Scvtf(z1.VnS(), p0.Merging(), z1.VnS());
14650 __ Zip1(z2.VnD(), z1.VnD(), z1.VnD());
14651 __ Zip1(z2.VnD(), z2.VnD(), z2.VnD());
14652 __ Index(z3.VnD(), 0, 1);
14653 __ And(z3.VnD(), z3.VnD(), 1);
14654 __ Cmpeq(p2.VnD(), p0.Zeroing(), z3.VnD(), 1);
14655 __ Dup(z3.VnB(), 0);
14656 __ Sel(z3.VnD(), p2, z2.VnD(), z3.VnD());
14657 __ Dup(z7.VnS(), 0);
14658 __ Fcmla(z7.VnS(), z0.VnS(), z3.VnS(), 1, 0);
14659 __ Fcmla(z7.VnS(), z0.VnS(), z3.VnS(), 1, 90);
14660 __ Ext(z3.VnB(), z3.VnB(), z3.VnB(), 8);
14661 __ Dup(z8.VnS(), 0);
14662 __ Fcmla(z8.VnS(), z0.VnS(), z3.VnS(), 0, 180);
14663 __ Fcmla(z8.VnS(), z0.VnS(), z3.VnS(), 0, 270);
14664 __ Fneg(z8.VnS(), p0.Merging(), z8.VnS());
14665 __ Dup(z9.VnS(), 0);
14666 __ Fcmla(z9.VnS(), p0.Merging(), z0.VnS(), z2.VnS(), 0);
14667 __ Fcmla(z9.VnS(), p0.Merging(), z0.VnS(), z2.VnS(), 90);
14668 END();
14669
14670 if (CAN_RUN()) {
14671 RUN();
14672 ASSERT_EQUAL_SVE(z6.VnH(), z4.VnH());
14673 ASSERT_EQUAL_SVE(z6.VnH(), z5.VnH());
14674 ASSERT_EQUAL_SVE(z9.VnS(), z7.VnS());
14675 ASSERT_EQUAL_SVE(z9.VnS(), z8.VnS());
14676 }
14677}
14678
Martyn Capewell75f1c432020-03-30 09:23:27 +010014679TEST_SVE(sve_fcmla) {
14680 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14681 START();
14682
14683 __ Ptrue(p0.VnB());
14684 __ Pfalse(p1.VnB());
14685 __ Zip1(p2.VnH(), p0.VnH(), p1.VnH()); // Real elements.
14686 __ Zip1(p3.VnH(), p1.VnH(), p0.VnH()); // Imaginary elements.
14687
14688 __ Fdup(z0.VnH(), 10.0);
14689 __ Fdup(z2.VnH(), 2.0);
14690
14691 // Create pairs of complex numbers, Ai + A. A is chosen to be non-zero;
14692 // otherwise the later Fneg would produce -0.0 and fail the bitwise comparison.
14693 __ Index(z1.VnH(), -4, 3);
14694 __ Zip1(z1.VnH(), z1.VnH(), z1.VnH());
14695 __ Zip1(z1.VnH(), z1.VnH(), z1.VnH());
14696 __ Scvtf(z1.VnH(), p0.Merging(), z1.VnH());
14697
14698 __ Sel(z3.VnH(), p2, z0.VnH(), z1.VnH()); // Ai + 10
14699 __ Sel(z4.VnH(), p2, z1.VnH(), z2.VnH()); // 2i + A
14700
14701 __ Zip1(p2.VnS(), p0.VnS(), p1.VnS()); // Even complex numbers.
14702 __ Zip1(p3.VnS(), p1.VnS(), p0.VnS()); // Odd complex numbers.
14703
14704 // Calculate (Ai + 10) * (2i + A) = (20 + A^2)i + 8A, using predication to
14705 // select only the complex numbers in odd-numbered element pairs. This leaves
14706 // results in elements 2/3, 6/7, etc. with zero in elements 0/1, 4/5, etc.
14707 // ... 7 6 5 4 3 2 1 0 <-- element
14708 // ... | 20+A^2 | 8A | 0 | 0 | 20+A^2 | 8A | 0 | 0 | <-- value
14709 __ Dup(z5.VnH(), 0);
14710 __ Fcmla(z5.VnH(), p3.Merging(), z4.VnH(), z3.VnH(), 0);
14711 __ Fcmla(z5.VnH(), p3.Merging(), z4.VnH(), z3.VnH(), 90);
14712
14713 // Move the odd results to the even result positions.
14714 // ... 7 6 5 4 3 2 1 0 <-- element
14715 // ... | 0 | 0 | 20+A^2 | 8A | 0 | 0 | 20+A^2 | 8A | <-- value
14716 __ Ext(z5.VnB(), z5.VnB(), z5.VnB(), 4);
14717
14718 // Calculate -(Ai + 10) * (2i + A) = -(20 + A^2)i - 8A for the even complex
14719 // numbers.
14720 // ... 7 6 5 4 3 2 1 0 <-- element
14721 // ... | 0 | 0 | -20-A^2 | -8A | 0 | 0 | -20-A^2 | -8A | <-- value
14722 __ Dup(z6.VnH(), 0);
14723 __ Fcmla(z6.VnH(), p2.Merging(), z4.VnH(), z3.VnH(), 180);
14724 __ Fcmla(z6.VnH(), p2.Merging(), z4.VnH(), z3.VnH(), 270);
14725
14726 // Negate the even results. The results in z6 should now match the results
14727 // computed earlier in z5.
14728 // ... 7 6 5 4 3 2 1 0 <-- element
14729 // ... | 0 | 0 | 20+A^2 | 8A | 0 | 0 | 20+A^2 | 8A | <-- value
14730 __ Fneg(z6.VnH(), p2.Merging(), z6.VnH());
14731
14732
14733 // Similarly, but for wider elements.
14734 __ Zip1(p2.VnS(), p0.VnS(), p1.VnS());
14735 __ Zip1(p3.VnS(), p1.VnS(), p0.VnS());
14736 __ Index(z1.VnS(), -4, 3);
14737 __ Zip1(z1.VnS(), z1.VnS(), z1.VnS());
14738 __ Zip1(z1.VnS(), z1.VnS(), z1.VnS());
14739 __ Scvtf(z1.VnS(), p0.Merging(), z1.VnS());
14740 __ Fdup(z0.VnS(), 20.0);
14741 __ Fdup(z2.VnS(), 21.0);
14742 __ Sel(z3.VnS(), p2, z0.VnS(), z1.VnS());
14743 __ Sel(z4.VnS(), p2, z1.VnS(), z2.VnS());
14744 __ Punpklo(p2.VnH(), p2.VnB());
14745 __ Punpklo(p3.VnH(), p3.VnB());
14746 __ Dup(z7.VnS(), 0);
14747 __ Fcmla(z7.VnS(), p3.Merging(), z4.VnS(), z3.VnS(), 0);
14748 __ Fcmla(z7.VnS(), p3.Merging(), z4.VnS(), z3.VnS(), 90);
14749 __ Ext(z7.VnB(), z7.VnB(), z7.VnB(), 8);
14750 __ Dup(z8.VnS(), 0);
14751 __ Fcmla(z8.VnS(), p2.Merging(), z4.VnS(), z3.VnS(), 180);
14752 __ Fcmla(z8.VnS(), p2.Merging(), z4.VnS(), z3.VnS(), 270);
14753 __ Fneg(z8.VnS(), p2.Merging(), z8.VnS());
14754
14755 // Double precision computed for even lanes only.
14756 __ Zip1(p2.VnD(), p0.VnD(), p1.VnD());
14757 __ Index(z1.VnD(), -4, 3);
14758 __ Zip1(z1.VnD(), z1.VnD(), z1.VnD());
14759 __ Zip1(z1.VnD(), z1.VnD(), z1.VnD());
14760 __ Scvtf(z1.VnD(), p0.Merging(), z1.VnD());
14761 __ Fdup(z0.VnD(), 20.0);
14762 __ Fdup(z2.VnD(), 21.0);
14763 __ Sel(z3.VnD(), p2, z0.VnD(), z1.VnD());
14764 __ Sel(z4.VnD(), p2, z1.VnD(), z2.VnD());
14765 __ Punpklo(p2.VnH(), p2.VnB());
14766 __ Dup(z9.VnD(), 0);
14767 __ Fcmla(z9.VnD(), p2.Merging(), z4.VnD(), z3.VnD(), 0);
14768 __ Fcmla(z9.VnD(), p2.Merging(), z4.VnD(), z3.VnD(), 90);
14769 __ Dup(z10.VnD(), 0);
14770 __ Fcmla(z10.VnD(), p2.Merging(), z4.VnD(), z3.VnD(), 180);
14771 __ Fcmla(z10.VnD(), p2.Merging(), z4.VnD(), z3.VnD(), 270);
14772 __ Fneg(z10.VnD(), p2.Merging(), z10.VnD());
14773 END();
14774
14775 if (CAN_RUN()) {
14776 RUN();
14777 ASSERT_EQUAL_SVE(z5.VnH(), z6.VnH());
14778 ASSERT_EQUAL_SVE(z7.VnS(), z8.VnS());
14779 ASSERT_EQUAL_SVE(z9.VnD(), z10.VnD());
14780 }
14781}
14782
Martyn Capewell46352612020-07-02 15:47:54 +010014783// Create a pattern in dst where the value of each element in src is incremented
14784// by the segment number. This allows varying a short input by a predictable
14785// pattern for each segment.
14786static void FPSegmentPatternHelper(MacroAssembler* masm,
14787 const ZRegister& dst,
14788 const PRegisterM& ptrue,
14789 const ZRegister& src) {
14790 VIXL_ASSERT(AreSameLaneSize(dst, src));
14791 UseScratchRegisterScope temps(masm);
14792 ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(dst);
14793 masm->Index(ztmp, 0, 1);
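  // Shifting the element index right by log2(lanes per 128-bit segment) yields
  // the segment number for each lane.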
14794 masm->Asr(ztmp, ztmp, kQRegSizeInBytesLog2 - dst.GetLaneSizeInBytesLog2());
14795 masm->Scvtf(ztmp, ptrue, ztmp);
14796 masm->Fadd(dst, src, ztmp);
14797}
14798
Martyn Capewell50e9f552020-01-07 17:45:03 +000014799TEST_SVE(sve_fpmul_index) {
14800 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14801 START();
14802
14803 uint64_t in0[] = {0x3ff000003f803c00, 0xbff00000bf80bc00};
14804 uint64_t in1[] = {0x3ff012343ff03c76, 0xbff01234bff0bc76};
14805
Martyn Capewell46352612020-07-02 15:47:54 +010014806 __ Ptrue(p0.VnB());
14807 // Repeat indexed vector across up to 2048-bit VL.
14808 for (size_t i = 0; i < (kZRegMaxSize / kDRegSize); i++) {
14809 InsrHelper(&masm, z25.VnD(), in0);
14810 }
Martyn Capewell50e9f552020-01-07 17:45:03 +000014811 InsrHelper(&masm, z1.VnD(), in1);
14812
Martyn Capewell46352612020-07-02 15:47:54 +010014813 FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z25.VnH());
Martyn Capewell50e9f552020-01-07 17:45:03 +000014814 __ Fmul(z2.VnH(), z1.VnH(), z0.VnH(), 0);
14815 __ Fmul(z3.VnH(), z1.VnH(), z0.VnH(), 1);
14816 __ Fmul(z4.VnH(), z1.VnH(), z0.VnH(), 4);
14817 __ Fmul(z5.VnH(), z1.VnH(), z0.VnH(), 7);
14818
14819 __ Fmul(z6.VnS(), z1.VnS(), z0.VnS(), 0);
14820 __ Fmul(z7.VnS(), z1.VnS(), z0.VnS(), 1);
14821 __ Fmul(z8.VnS(), z1.VnS(), z0.VnS(), 2);
14822 __ Fmul(z9.VnS(), z1.VnS(), z0.VnS(), 3);
14823
14824 __ Fmul(z10.VnD(), z1.VnD(), z0.VnD(), 0);
14825 __ Fmul(z11.VnD(), z1.VnD(), z0.VnD(), 1);
14826
14827 // Compute the results using other instructions.
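  // Fmul (indexed) multiplies by the indexed element within each 128-bit
  // segment, so the reference duplicates that element and then reapplies the
  // per-segment increment before an unindexed Fmul.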
Martyn Capewell46352612020-07-02 15:47:54 +010014828 __ Dup(z12.VnH(), z25.VnH(), 0);
14829 FPSegmentPatternHelper(&masm, z12.VnH(), p0.Merging(), z12.VnH());
Martyn Capewell50e9f552020-01-07 17:45:03 +000014830 __ Fmul(z12.VnH(), z1.VnH(), z12.VnH());
Martyn Capewell46352612020-07-02 15:47:54 +010014831 __ Dup(z13.VnH(), z25.VnH(), 1);
14832 FPSegmentPatternHelper(&masm, z13.VnH(), p0.Merging(), z13.VnH());
Martyn Capewell50e9f552020-01-07 17:45:03 +000014833 __ Fmul(z13.VnH(), z1.VnH(), z13.VnH());
Martyn Capewell46352612020-07-02 15:47:54 +010014834 __ Dup(z14.VnH(), z25.VnH(), 4);
14835 FPSegmentPatternHelper(&masm, z14.VnH(), p0.Merging(), z14.VnH());
Martyn Capewell50e9f552020-01-07 17:45:03 +000014836 __ Fmul(z14.VnH(), z1.VnH(), z14.VnH());
Martyn Capewell46352612020-07-02 15:47:54 +010014837 __ Dup(z15.VnH(), z25.VnH(), 7);
14838 FPSegmentPatternHelper(&masm, z15.VnH(), p0.Merging(), z15.VnH());
Martyn Capewell50e9f552020-01-07 17:45:03 +000014839 __ Fmul(z15.VnH(), z1.VnH(), z15.VnH());
14840
Martyn Capewell46352612020-07-02 15:47:54 +010014841 __ Dup(z16.VnS(), z25.VnS(), 0);
14842 FPSegmentPatternHelper(&masm, z16.VnH(), p0.Merging(), z16.VnH());
Martyn Capewell50e9f552020-01-07 17:45:03 +000014843 __ Fmul(z16.VnS(), z1.VnS(), z16.VnS());
Martyn Capewell46352612020-07-02 15:47:54 +010014844 __ Dup(z17.VnS(), z25.VnS(), 1);
14845 FPSegmentPatternHelper(&masm, z17.VnH(), p0.Merging(), z17.VnH());
Martyn Capewell50e9f552020-01-07 17:45:03 +000014846 __ Fmul(z17.VnS(), z1.VnS(), z17.VnS());
Martyn Capewell46352612020-07-02 15:47:54 +010014847 __ Dup(z18.VnS(), z25.VnS(), 2);
14848 FPSegmentPatternHelper(&masm, z18.VnH(), p0.Merging(), z18.VnH());
Martyn Capewell50e9f552020-01-07 17:45:03 +000014849 __ Fmul(z18.VnS(), z1.VnS(), z18.VnS());
Martyn Capewell46352612020-07-02 15:47:54 +010014850 __ Dup(z19.VnS(), z25.VnS(), 3);
14851 FPSegmentPatternHelper(&masm, z19.VnH(), p0.Merging(), z19.VnH());
Martyn Capewell50e9f552020-01-07 17:45:03 +000014852 __ Fmul(z19.VnS(), z1.VnS(), z19.VnS());
14853
Martyn Capewell46352612020-07-02 15:47:54 +010014854 __ Dup(z20.VnD(), z25.VnD(), 0);
14855 FPSegmentPatternHelper(&masm, z20.VnH(), p0.Merging(), z20.VnH());
Martyn Capewell50e9f552020-01-07 17:45:03 +000014856 __ Fmul(z20.VnD(), z1.VnD(), z20.VnD());
Martyn Capewell46352612020-07-02 15:47:54 +010014857 __ Dup(z21.VnD(), z25.VnD(), 1);
14858 FPSegmentPatternHelper(&masm, z21.VnH(), p0.Merging(), z21.VnH());
Martyn Capewell50e9f552020-01-07 17:45:03 +000014859 __ Fmul(z21.VnD(), z1.VnD(), z21.VnD());
14860
14861 END();
14862
14863 if (CAN_RUN()) {
14864 RUN();
14865 ASSERT_EQUAL_SVE(z12.VnH(), z2.VnH());
14866 ASSERT_EQUAL_SVE(z13.VnH(), z3.VnH());
14867 ASSERT_EQUAL_SVE(z14.VnH(), z4.VnH());
14868 ASSERT_EQUAL_SVE(z15.VnH(), z5.VnH());
14869 ASSERT_EQUAL_SVE(z16.VnS(), z6.VnS());
14870 ASSERT_EQUAL_SVE(z17.VnS(), z7.VnS());
14871 ASSERT_EQUAL_SVE(z18.VnS(), z8.VnS());
14872 ASSERT_EQUAL_SVE(z19.VnS(), z9.VnS());
14873 ASSERT_EQUAL_SVE(z20.VnD(), z10.VnD());
14874 ASSERT_EQUAL_SVE(z21.VnD(), z11.VnD());
14875 }
14876}
14877
Martyn Capewell5fb2ad62020-01-10 14:08:27 +000014878TEST_SVE(sve_ftmad) {
14879 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14880 START();
14881
14882 uint64_t in_h0[] = {0x7c027e01fc02fe01,
14883 0x3c003c00bc00bc00,
14884 0x3c003c00bc00bc00};
14885 uint64_t in_h1[] = {0xfe01fc027e017e01,
14886 0x3c00bc003c00bc00,
14887 0x3c00bc003c00bc00};
14888 uint64_t in_s0[] = {0x7f800002ffc00001,
14889 0x3f8000003f800000,
14890 0xbf800000bf800000};
14891 uint64_t in_s1[] = {0xffc00001ffc00001,
14892 0x3f800000bf800000,
14893 0x3f800000bf800000};
14894 uint64_t in_d0[] = {0x7ff8000000000001,
14895 0x3ff0000000000000,
14896 0xbff0000000000000};
14897 uint64_t in_d1[] = {0xfff0000000000002,
14898 0xbff0000000000000,
14899 0x3ff0000000000000};
14900 InsrHelper(&masm, z0.VnD(), in_h0);
14901 InsrHelper(&masm, z1.VnD(), in_h1);
14902 InsrHelper(&masm, z2.VnD(), in_s0);
14903 InsrHelper(&masm, z3.VnD(), in_s1);
14904 InsrHelper(&masm, z4.VnD(), in_d0);
14905 InsrHelper(&masm, z5.VnD(), in_d1);
14906
14907 __ Mov(z6, z0);
14908 __ Ftmad(z6.VnH(), z6.VnH(), z1.VnH(), 0);
14909 __ Mov(z7, z0);
14910 __ Ftmad(z7.VnH(), z7.VnH(), z1.VnH(), 1);
14911 __ Mov(z8, z0);
14912 __ Ftmad(z8.VnH(), z8.VnH(), z1.VnH(), 2);
14913
14914 __ Mov(z9, z2);
14915 __ Ftmad(z9.VnS(), z9.VnS(), z3.VnS(), 0);
14916 __ Mov(z10, z2);
14917 __ Ftmad(z10.VnS(), z10.VnS(), z3.VnS(), 3);
14918 __ Mov(z11, z2);
14919 __ Ftmad(z11.VnS(), z11.VnS(), z3.VnS(), 4);
14920
14921 __ Mov(z12, z4);
14922 __ Ftmad(z12.VnD(), z12.VnD(), z5.VnD(), 0);
14923 __ Mov(z13, z4);
14924 __ Ftmad(z13.VnD(), z13.VnD(), z5.VnD(), 5);
14925 __ Mov(z14, z4);
14926 __ Ftmad(z14.VnD(), z14.VnD(), z5.VnD(), 7);
14927
14928 END();
14929
14930 if (CAN_RUN()) {
14931 RUN();
14932 uint64_t expected_z6[] = {0x7e027e02fe02fe01,
14933 0x4000400000000000,
14934 0x4000400000000000};
14935 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
14936 uint64_t expected_z7[] = {0x7e027e02fe02fe01,
14937 0x3aab3800bcabbe00,
14938 0x3aab3800bcabbe00};
14939 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
14940 uint64_t expected_z8[] = {0x7e027e02fe02fe01,
14941 0x3c083c2abbefbbac,
14942 0x3c083c2abbefbbac};
14943 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
14944 uint64_t expected_z9[] = {0x7fc00002ffc00001,
14945 0x4000000040000000,
14946 0x0000000000000000};
14947 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
14948 uint64_t expected_z10[] = {0x7fc00002ffc00001,
14949 0x3f7ff2ff3f7fa4fc,
14950 0xbf800680bf802d82};
14951 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
14952 uint64_t expected_z11[] = {0x7fc00002ffc00001,
14953 0x3f8000173f8000cd,
14954 0xbf7fffd2bf7ffe66};
14955 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
14956 uint64_t expected_z12[] = {0x7ff8000000000002,
14957 0x4000000000000000,
14958 0x0000000000000000};
14959 ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
14960 uint64_t expected_z13[] = {0x7ff8000000000002,
14961 0x3fefffff6c0d846c,
14962 0xbff0000006b978ae};
14963 ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
14964 uint64_t expected_z14[] = {0x7ff8000000000002,
14965 0x3feffffffffe708a,
14966 0xbff0000000000000};
14967 ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
14968 }
14969}
14970
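// Apply the predicated FP arithmetic macros (Fadd, Fsub, Fabd, Fmul, Fmulx,
// Fminnm, Fmaxnm) to a vector and its element-reversed copy, using a mixed
// predicate so that both active and inactive lanes are exercised.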
Martyn Capewell37f28182020-01-14 10:15:10 +000014971static void BasicFPArithHelper(MacroAssembler* masm,
14972 int lane_size_in_bits,
14973 const uint64_t (&inputs)[2],
14974 const uint64_t (&inputs_fmulx)[2],
14975 const uint64_t (&inputs_nans)[2]) {
14976 int ls = lane_size_in_bits;
14977
14978 for (int i = 0; i < 16; i++) {
14979 InsrHelper(masm, z0.VnD(), inputs);
14980 }
14981 ZRegister rvrs = z1.WithLaneSize(ls);
14982 masm->Rev(rvrs, z0.WithLaneSize(ls));
14983
14984 int pred[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1};
14985 Initialise(masm, p2.VnB(), pred);
14986 PRegisterM p2m = p2.Merging();
14987
14988 masm->Mov(z2, z0);
14989 masm->Fadd(z2.WithLaneSize(ls),
14990 p2m,
14991 z2.WithLaneSize(ls),
14992 rvrs,
14993 FastNaNPropagation);
14994 masm->Mov(z3, z0);
14995 masm->Fsub(z3.WithLaneSize(ls), p2m, z3.WithLaneSize(ls), rvrs);
14996 masm->Mov(z4, z0);
14997 masm->Fsub(z4.WithLaneSize(ls), p2m, rvrs, z4.WithLaneSize(ls));
14998 masm->Mov(z5, z0);
14999 masm->Fabd(z5.WithLaneSize(ls),
15000 p2m,
15001 z5.WithLaneSize(ls),
15002 rvrs,
15003 FastNaNPropagation);
15004 masm->Mov(z6, z0);
15005 masm->Fmul(z6.WithLaneSize(ls),
15006 p2m,
15007 z6.WithLaneSize(ls),
15008 rvrs,
15009 FastNaNPropagation);
15010
15011 for (int i = 0; i < 16; i++) {
15012 InsrHelper(masm, z7.VnD(), inputs_fmulx);
15013 }
15014 masm->Rev(z8.WithLaneSize(ls), z7.WithLaneSize(ls));
15015 masm->Fmulx(z7.WithLaneSize(ls),
15016 p2m,
15017 z7.WithLaneSize(ls),
15018 z8.WithLaneSize(ls),
15019 FastNaNPropagation);
15020
15021 InsrHelper(masm, z8.VnD(), inputs_nans);
15022 masm->Mov(z9, z8);
15023 masm->Fminnm(z9.WithLaneSize(ls),
15024 p2m,
15025 z9.WithLaneSize(ls),
15026 rvrs,
15027 FastNaNPropagation);
15028 masm->Mov(z10, z8);
15029 masm->Fmaxnm(z10.WithLaneSize(ls),
15030 p2m,
15031 z10.WithLaneSize(ls),
15032 rvrs,
15033 FastNaNPropagation);
15034}
15035
15036TEST_SVE(sve_fp_arith_pred_h) {
15037 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
15038 START();
15039
15040 uint64_t inputs[] = {0x4800470046004500, 0x4400420040003c00};
15041 uint64_t inputs_fmulx[] = {0x7c00fc007c00fc00, 0x0000800000008000};
15042 uint64_t inputs_nans[] = {0x7fffffff7fffffff, 0x7bfffbff7fbbfbff};
15043
15044 BasicFPArithHelper(&masm, kHRegSize, inputs, inputs_fmulx, inputs_nans);
15045
15046 END();
15047
15048 if (CAN_RUN()) {
15049 RUN();
15050 uint64_t expected_z2[] = {0x4880488048804880, 0x4880420048804880};
15051 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
15052 uint64_t expected_z3[] = {0x4700450042003c00, 0xbc004200c500c700};
15053 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
15054 uint64_t expected_z4[] = {0xc700c500c200bc00, 0x3c00420045004700};
15055 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
15056 uint64_t expected_z5[] = {0x4700450042003c00, 0x3c00420045004700};
15057 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
15058 uint64_t expected_z6[] = {0x48004b004c804d00, 0x4d0042004b004800};
15059 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
15060 uint64_t expected_z7[] = {0xc000c000c000c000, 0xc0008000c000c000};
15061 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
15062 uint64_t expected_z9[] = {0x3c00400042004400, 0x4500fbff4700fbff};
15063 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
15064 uint64_t expected_z10[] = {0x3c00400042004400, 0x7bfffbff47004800};
15065 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
15066 }
15067}
15068
15069TEST_SVE(sve_fp_arith_pred_s) {
15070 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
15071 START();
15072
15073 uint64_t inputs[] = {0x4080000040400000, 0x400000003f800000};
15074 uint64_t inputs_fmulx[] = {0x7f800000ff800000, 0x0000000080000000};
15075 uint64_t inputs_nans[] = {0x7fffffffffffffff, 0x41000000c1000000};
15076
15077 BasicFPArithHelper(&masm, kSRegSize, inputs, inputs_fmulx, inputs_nans);
15078
15079 END();
15080
15081 if (CAN_RUN()) {
15082 RUN();
15083 uint64_t expected_z2[] = {0x40a0000040a00000, 0x4000000040a00000};
15084 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
15085 uint64_t expected_z3[] = {0x404000003f800000, 0x40000000c0400000};
15086 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
15087 uint64_t expected_z4[] = {0xc0400000bf800000, 0x4000000040400000};
15088 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
15089 uint64_t expected_z5[] = {0x404000003f800000, 0x4000000040400000};
15090 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
15091 uint64_t expected_z6[] = {0x4080000040c00000, 0x4000000040800000};
15092 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
15093 uint64_t expected_z7[] = {0xc0000000c0000000, 0x00000000c0000000};
15094 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
15095 uint64_t expected_z9[] = {0x3f80000040000000, 0x41000000c1000000};
15096 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
15097 uint64_t expected_z10[] = {0x3f80000040000000, 0x4100000040800000};
15098 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
15099 }
15100}
15101
15102TEST_SVE(sve_fp_arith_pred_d) {
15103 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
15104 START();
15105
15106 uint64_t inputs[] = {0x4000000000000000, 0x3ff0000000000000};
15107 uint64_t inputs_fmulx[] = {0x7ff0000000000000, 0x8000000000000000};
15108 uint64_t inputs_nans[] = {0x7fffffffffffffff, 0x4100000000000000};
15109
15110 BasicFPArithHelper(&masm, kDRegSize, inputs, inputs_fmulx, inputs_nans);
15111
15112 END();
15113
15114 if (CAN_RUN()) {
15115 RUN();
15116 uint64_t expected_z2[] = {0x4008000000000000, 0x4008000000000000};
15117 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
15118 uint64_t expected_z3[] = {0x3ff0000000000000, 0xbff0000000000000};
15119 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
15120 uint64_t expected_z4[] = {0xbff0000000000000, 0x3ff0000000000000};
15121 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
15122 uint64_t expected_z5[] = {0x3ff0000000000000, 0x3ff0000000000000};
15123 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
15124 uint64_t expected_z6[] = {0x4000000000000000, 0x4000000000000000};
15125 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
15126 uint64_t expected_z7[] = {0xc000000000000000, 0xc000000000000000};
15127 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
15128 uint64_t expected_z9[] = {0x3ff0000000000000, 0x4000000000000000};
15129 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
15130 uint64_t expected_z10[] = {0x3ff0000000000000, 0x4100000000000000};
15131 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
15132 }
15133}
15134
Martyn Capewella2fadc22020-01-16 16:09:55 +000015135TEST_SVE(sve_fp_arith_pred_imm) {
15136 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
15137 START();
15138
15139 int pred[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1};
15140 Initialise(&masm, p0.VnB(), pred);
15141 PRegisterM p0m = p0.Merging();
15142 __ Ptrue(p1.VnB());
15143
15144 __ Fdup(z0.VnD(), 0.0);
15145
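  // Generate the default NaN (0.0 / 0.0) in z1 for the Fminnm/Fmaxnm checks
  // below.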
15146 __ Mov(z1, z0);
15147 __ Fdiv(z1.VnH(), p1.Merging(), z1.VnH(), z1.VnH());
15148 __ Mov(z2, z0);
15149 __ Fadd(z2.VnH(), p0m, z2.VnH(), 0.5);
15150 __ Mov(z3, z2);
15151 __ Fsub(z3.VnH(), p0m, z3.VnH(), 1.0);
15152 __ Mov(z4, z3);
15153 __ Fsub(z4.VnH(), p0m, 1.0, z4.VnH());
15154 __ Mov(z5, z4);
15155 __ Fmul(z5.VnH(), p0m, z5.VnH(), 2.0);
15156 __ Mov(z6, z1);
15157 __ Fminnm(z6.VnH(), p0m, z6.VnH(), 0.0);
15158 __ Mov(z7, z1);
15159 __ Fmaxnm(z7.VnH(), p0m, z7.VnH(), 1.0);
15160 __ Mov(z8, z5);
15161 __ Fmin(z8.VnH(), p0m, z8.VnH(), 1.0);
15162 __ Mov(z9, z5);
15163 __ Fmax(z9.VnH(), p0m, z9.VnH(), 0.0);
15164
15165 __ Mov(z11, z0);
15166 __ Fdiv(z11.VnS(), p1.Merging(), z11.VnS(), z11.VnS());
15167 __ Mov(z12, z0);
15168 __ Fadd(z12.VnS(), p0m, z12.VnS(), 0.5);
15169 __ Mov(z13, z12);
15170 __ Fsub(z13.VnS(), p0m, z13.VnS(), 1.0);
15171 __ Mov(z14, z13);
15172 __ Fsub(z14.VnS(), p0m, 1.0, z14.VnS());
15173 __ Mov(z15, z14);
15174 __ Fmul(z15.VnS(), p0m, z15.VnS(), 2.0);
15175 __ Mov(z16, z11);
15176 __ Fminnm(z16.VnS(), p0m, z16.VnS(), 0.0);
15177 __ Mov(z17, z11);
15178 __ Fmaxnm(z17.VnS(), p0m, z17.VnS(), 1.0);
15179 __ Mov(z18, z15);
15180 __ Fmin(z18.VnS(), p0m, z18.VnS(), 1.0);
15181 __ Mov(z19, z15);
15182 __ Fmax(z19.VnS(), p0m, z19.VnS(), 0.0);
15183
15184 __ Mov(z21, z0);
15185 __ Fdiv(z21.VnD(), p1.Merging(), z21.VnD(), z21.VnD());
15186 __ Mov(z22, z0);
15187 __ Fadd(z22.VnD(), p0m, z22.VnD(), 0.5);
15188 __ Mov(z23, z22);
15189 __ Fsub(z23.VnD(), p0m, z23.VnD(), 1.0);
15190 __ Mov(z24, z23);
15191 __ Fsub(z24.VnD(), p0m, 1.0, z24.VnD());
15192 __ Mov(z25, z24);
15193 __ Fmul(z25.VnD(), p0m, z25.VnD(), 2.0);
15194 __ Mov(z26, z21);
15195 __ Fminnm(z26.VnD(), p0m, z26.VnD(), 0.0);
15196 __ Mov(z27, z21);
15197 __ Fmaxnm(z27.VnD(), p0m, z27.VnD(), 1.0);
15198 __ Mov(z28, z25);
15199 __ Fmin(z28.VnD(), p0m, z28.VnD(), 1.0);
15200 __ Mov(z29, z25);
15201 __ Fmax(z29.VnD(), p0m, z29.VnD(), 0.0);
15202
15203 __ Index(z0.VnH(), -3, 1);
15204 __ Scvtf(z0.VnH(), p1.Merging(), z0.VnH());
15205 __ Fmax(z0.VnH(), p1.Merging(), z0.VnH(), 0.0);
15206 __ Index(z1.VnS(), -4, 2);
15207 __ Scvtf(z1.VnS(), p1.Merging(), z1.VnS());
15208 __ Fadd(z1.VnS(), p1.Merging(), z1.VnS(), 1.0);
15209
15210 END();
15211
15212 if (CAN_RUN()) {
15213 RUN();
15214 uint64_t expected_z2[] = {0x3800380038003800, 0x3800000038003800};
15215 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
15216 uint64_t expected_z3[] = {0xb800b800b800b800, 0xb8000000b800b800};
15217 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
15218 uint64_t expected_z4[] = {0x3e003e003e003e00, 0x3e0000003e003e00};
15219 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
15220 uint64_t expected_z5[] = {0x4200420042004200, 0x4200000042004200};
15221 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
15222 uint64_t expected_z6[] = {0x0000000000000000, 0x00007e0000000000};
15223 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
15224 uint64_t expected_z7[] = {0x3c003c003c003c00, 0x3c007e003c003c00};
15225 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
15226 uint64_t expected_z8[] = {0x3c003c003c003c00, 0x3c0000003c003c00};
15227 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
15228 uint64_t expected_z9[] = {0x4200420042004200, 0x4200000042004200};
15229 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
15230
15231 uint64_t expected_z12[] = {0x3f0000003f000000, 0x000000003f000000};
15232 ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
15233 uint64_t expected_z13[] = {0xbf000000bf000000, 0x00000000bf000000};
15234 ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
15235 uint64_t expected_z14[] = {0x3fc000003fc00000, 0x000000003fc00000};
15236 ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
15237 uint64_t expected_z15[] = {0x4040000040400000, 0x0000000040400000};
15238 ASSERT_EQUAL_SVE(expected_z15, z15.VnD());
15239 uint64_t expected_z16[] = {0x0000000000000000, 0x7fc0000000000000};
15240 ASSERT_EQUAL_SVE(expected_z16, z16.VnD());
15241 uint64_t expected_z17[] = {0x3f8000003f800000, 0x7fc000003f800000};
15242 ASSERT_EQUAL_SVE(expected_z17, z17.VnD());
15243 uint64_t expected_z18[] = {0x3f8000003f800000, 0x000000003f800000};
15244 ASSERT_EQUAL_SVE(expected_z18, z18.VnD());
15245 uint64_t expected_z19[] = {0x4040000040400000, 0x0000000040400000};
15246 ASSERT_EQUAL_SVE(expected_z19, z19.VnD());
15247
15248 uint64_t expected_z22[] = {0x3fe0000000000000, 0x3fe0000000000000};
15249 ASSERT_EQUAL_SVE(expected_z22, z22.VnD());
15250 uint64_t expected_z23[] = {0xbfe0000000000000, 0xbfe0000000000000};
15251 ASSERT_EQUAL_SVE(expected_z23, z23.VnD());
15252 uint64_t expected_z24[] = {0x3ff8000000000000, 0x3ff8000000000000};
15253 ASSERT_EQUAL_SVE(expected_z24, z24.VnD());
15254 uint64_t expected_z25[] = {0x4008000000000000, 0x4008000000000000};
15255 ASSERT_EQUAL_SVE(expected_z25, z25.VnD());
15256 uint64_t expected_z26[] = {0x0000000000000000, 0x0000000000000000};
15257 ASSERT_EQUAL_SVE(expected_z26, z26.VnD());
15258 uint64_t expected_z27[] = {0x3ff0000000000000, 0x3ff0000000000000};
15259 ASSERT_EQUAL_SVE(expected_z27, z27.VnD());
15260 uint64_t expected_z28[] = {0x3ff0000000000000, 0x3ff0000000000000};
15261 ASSERT_EQUAL_SVE(expected_z28, z28.VnD());
15262 uint64_t expected_z29[] = {0x4008000000000000, 0x4008000000000000};
15263 ASSERT_EQUAL_SVE(expected_z29, z29.VnD());
15264 uint64_t expected_z0[] = {0x4400420040003c00, 0x0000000000000000};
15265 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
15266 uint64_t expected_z1[] = {0x404000003f800000, 0xbf800000c0400000};
15267 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
15268 }
15269}
15270
Martyn Capewell37f28182020-01-14 10:15:10 +000015271TEST_SVE(sve_fscale) {
15272 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
15273 START();
15274
15275 uint64_t inputs_h[] = {0x4800470046004500, 0x4400420040003c00};
15276 InsrHelper(&masm, z0.VnD(), inputs_h);
15277 uint64_t inputs_s[] = {0x4080000040400000, 0x400000003f800000};
15278 InsrHelper(&masm, z1.VnD(), inputs_s);
15279 uint64_t inputs_d[] = {0x40f0000000000000, 0x4000000000000000};
15280 InsrHelper(&masm, z2.VnD(), inputs_d);
15281
15282 uint64_t scales[] = {0x00080002fff8fffe, 0x00100001fff0ffff};
15283 InsrHelper(&masm, z3.VnD(), scales);
15284
15285 __ Ptrue(p0.VnB());
15286 int pred[] = {0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1};
15287 Initialise(&masm, p1.VnB(), pred);
15288
15289 __ Mov(z4, z0);
15290 __ Fscale(z4.VnH(), p0.Merging(), z4.VnH(), z3.VnH());
15291 __ Mov(z5, z0);
15292 __ Fscale(z5.VnH(), p1.Merging(), z5.VnH(), z3.VnH());
15293
15294 __ Sunpklo(z3.VnS(), z3.VnH());
15295 __ Mov(z6, z1);
15296 __ Fscale(z6.VnS(), p0.Merging(), z6.VnS(), z3.VnS());
15297 __ Mov(z7, z1);
15298 __ Fscale(z7.VnS(), p1.Merging(), z7.VnS(), z3.VnS());
15299
15300 __ Sunpklo(z3.VnD(), z3.VnS());
15301 __ Mov(z8, z2);
15302 __ Fscale(z8.VnD(), p0.Merging(), z8.VnD(), z3.VnD());
15303 __ Mov(z9, z2);
15304 __ Fscale(z9.VnD(), p1.Merging(), z9.VnD(), z3.VnD());
15305
15306 // Test full double precision range scaling.
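  // Fscale computes zd = zn * 2^zm, so scaling 2^-1022 by 2^2045 produces
  // 2^1023, the largest power of two representable in double precision.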
15307 __ Dup(z10.VnD(), 2045);
15308 __ Dup(z11.VnD(), 0x0010000000000000); // 2^-1022
15309 __ Fscale(z11.VnD(), p0.Merging(), z11.VnD(), z10.VnD());
15310
15311 END();
15312
15313 if (CAN_RUN()) {
15314 RUN();
15315
15316 uint64_t expected_z4[] = {0x68004f0026003d00, 0x7c00460002003800};
15317 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
15318 uint64_t expected_z5[] = {0x68004f0026004500, 0x7c00420002003800};
15319 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
15320
15321 uint64_t expected_z6[] = {0x4880000040c00000, 0x380000003f000000};
15322 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
15323 uint64_t expected_z7[] = {0x4880000040400000, 0x400000003f000000};
15324 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
15325
15326 uint64_t expected_z8[] = {0x3ff0000000000000, 0x3ff0000000000000};
15327 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
15328 uint64_t expected_z9[] = {0x40f0000000000000, 0x3ff0000000000000};
15329 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
15330
15331 uint64_t expected_z11[] = {0x7fe0000000000000, 0x7fe0000000000000};
15332 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
15333 }
15334}
15335
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015336typedef void (MacroAssembler::*FcvtFrintMFn)(const ZRegister& zd,
15337 const PRegisterM& pg,
15338 const ZRegister& zn);
15339
15340typedef void (MacroAssembler::*FcvtFrintZFn)(const ZRegister& zd,
15341 const PRegisterZ& pg,
15342 const ZRegister& zn);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015343
15344template <typename F, size_t N>
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015345static void TestFcvtFrintHelper(Test* config,
15346 FcvtFrintMFn macro_m,
15347 FcvtFrintZFn macro_z,
15348 int dst_type_size_in_bits,
15349 int src_type_size_in_bits,
15350 const F (&zn_inputs)[N],
15351 const int (&pg_inputs)[N],
15352 const uint64_t (&zd_expected_all_active)[N]) {
15353 VIXL_ASSERT(macro_m != NULL);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015354 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
15355 START();
15356
15357 // If the input and result types have a different size, the instruction
15358 // operates on elements of the largest specified type, so the lane size used
15359 // for the test is determined by the larger type.
15360 int lane_size_in_bits =
15361 std::max(dst_type_size_in_bits, src_type_size_in_bits);
15362
15363 ZRegister zd_all_active = z25;
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015364 ZRegister zd_merging = z26;
TatWai Chongdb7437c2020-01-09 17:44:10 -080015365 ZRegister zn = z27;
15366
15367 uint64_t zn_rawbits[N];
15368 FPToRawbitsWithSize(zn_inputs, zn_rawbits, src_type_size_in_bits);
15369 InsrHelper(&masm, zn.WithLaneSize(lane_size_in_bits), zn_rawbits);
15370
15371 PRegisterWithLaneSize pg_all_active = p0.WithLaneSize(lane_size_in_bits);
15372 __ Ptrue(pg_all_active);
15373
15374 // Test floating-point conversions with all lanes active.
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015375 (masm.*macro_m)(zd_all_active.WithLaneSize(dst_type_size_in_bits),
15376 pg_all_active.Merging(),
15377 zn.WithLaneSize(src_type_size_in_bits));
TatWai Chongdb7437c2020-01-09 17:44:10 -080015378
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015379 PRegisterWithLaneSize pg_merging = p1.WithLaneSize(lane_size_in_bits);
15380 Initialise(&masm, pg_merging, pg_inputs);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015381
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015382 __ Dup(zd_merging.VnD(), 0x0bad0bad0bad0bad);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015383
15384 // Use the same `zn` inputs to test floating-point conversions, but with some
15385 // lanes set inactive.
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015386 (masm.*macro_m)(zd_merging.WithLaneSize(dst_type_size_in_bits),
15387 pg_merging.Merging(),
15388 zn.WithLaneSize(src_type_size_in_bits));
15389
15390 ZRegister zd_zeroing = z24;
15391 PRegisterWithLaneSize pg_zeroing = p1.WithLaneSize(lane_size_in_bits);
15392 Initialise(&masm, pg_zeroing, pg_inputs);
15393
15394 if (macro_z != NULL) {
15395 __ Dup(zd_zeroing.VnD(), 0x0bad0bad0bad0bad);
15396 (masm.*macro_z)(zd_zeroing.WithLaneSize(dst_type_size_in_bits),
15397 pg_zeroing.Zeroing(),
15398 zn.WithLaneSize(src_type_size_in_bits));
15399 }
TatWai Chongdb7437c2020-01-09 17:44:10 -080015400
15401 END();
15402
15403 if (CAN_RUN()) {
15404 RUN();
15405
15406 ASSERT_EQUAL_SVE(zd_expected_all_active,
15407 zd_all_active.WithLaneSize(lane_size_in_bits));
15408
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015409 uint64_t zd_expected_merging[N];
TatWai Chongdb7437c2020-01-09 17:44:10 -080015410 for (unsigned i = 0; i < N; i++) {
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015411 zd_expected_merging[i] =
TatWai Chongdb7437c2020-01-09 17:44:10 -080015412 pg_inputs[i] ? zd_expected_all_active[i]
15413 : 0x0bad0bad0bad0bad & GetUintMask(lane_size_in_bits);
15414 }
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015415 ASSERT_EQUAL_SVE(zd_expected_merging,
15416 zd_merging.WithLaneSize(lane_size_in_bits));
15417
15418 if (macro_z != NULL) {
15419 uint64_t zd_expected_zeroing[N] = {0};
15420 for (unsigned i = 0; i < N; i++) {
15421 if (pg_inputs[i]) {
15422 zd_expected_zeroing[i] = zd_expected_all_active[i];
15423 }
15424 }
15425 ASSERT_EQUAL_SVE(zd_expected_zeroing,
15426 zd_zeroing.WithLaneSize(lane_size_in_bits));
15427 }
TatWai Chongdb7437c2020-01-09 17:44:10 -080015428 }
15429}
15430
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015431template <typename F, size_t N>
15432static void TestFcvtzHelper(Test* config,
15433 FcvtFrintMFn macro_m,
15434 int dst_type_size_in_bits,
15435 int src_type_size_in_bits,
15436 const F (&zn_inputs)[N],
15437 const int (&pg_inputs)[N],
15438 const uint64_t (&zd_expected_all_active)[N]) {
15439 TestFcvtFrintHelper(config,
15440 macro_m,
15441 // Fcvt variants have no zeroing predication form.
15442 NULL,
15443 dst_type_size_in_bits,
15444 src_type_size_in_bits,
15445 zn_inputs,
15446 pg_inputs,
15447 zd_expected_all_active);
15448}
15449
TatWai Chongdb7437c2020-01-09 17:44:10 -080015450TEST_SVE(fcvtzs_fcvtzu_float16) {
TatWai Chongdb7437c2020-01-09 17:44:10 -080015451 const double h_max_float16 = kHMaxInt; // Largest float16 == INT16_MAX.
15452 const double h_min_float16 = -h_max_float16; // Smallest float16 > INT16_MIN.
15453 const double largest_float16 = 0xffe0; // 65504
15454 const double smallest_float16 = -largest_float16;
15455 const double h_max_int_sub_one = kHMaxInt - 1;
15456 const double h_min_int_add_one = kHMinInt + 1;
15457
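  // Fcvtzs and Fcvtzu round towards zero; inputs outside the destination range
  // (including the infinities) saturate to the most negative or most positive
  // representable integer.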
15458 double zn_inputs[] = {1.0,
15459 1.1,
15460 1.5,
15461 -1.5,
15462 h_max_float16,
15463 h_min_float16,
15464 largest_float16,
15465 smallest_float16,
15466 kFP64PositiveInfinity,
15467 kFP64NegativeInfinity,
15468 h_max_int_sub_one,
15469 h_min_int_add_one};
15470
15471 int pg_inputs[] = {0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0};
15472
15473 uint64_t expected_fcvtzs_fp162h[] = {1,
15474 1,
15475 1,
15476 0xffff,
15477 0x7fff,
15478 0x8000,
15479 0x7fff,
15480 0x8000,
15481 0x7fff,
15482 0x8000,
15483 0x7fff,
15484 0x8000};
15485
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015486 uint64_t expected_fcvtzu_fp162h[] =
15487 {1, 1, 1, 0, 0x8000, 0, 0xffe0, 0, 0xffff, 0, 0x8000, 0};
TatWai Chongdb7437c2020-01-09 17:44:10 -080015488
15489 // Float16 to 16-bit integers.
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015490 TestFcvtzHelper(config,
15491 &MacroAssembler::Fcvtzs,
15492 kHRegSize,
15493 kHRegSize,
15494 zn_inputs,
15495 pg_inputs,
15496 expected_fcvtzs_fp162h);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015497
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015498 TestFcvtzHelper(config,
15499 &MacroAssembler::Fcvtzu,
15500 kHRegSize,
15501 kHRegSize,
15502 zn_inputs,
15503 pg_inputs,
15504 expected_fcvtzu_fp162h);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015505
15506 uint64_t expected_fcvtzs_fp162w[] = {1,
15507 1,
15508 1,
15509 0xffffffff,
15510 0x8000,
15511 0xffff8000,
15512 0xffe0,
15513 0xffff0020,
15514 0x7fffffff,
15515 0x80000000,
15516 0x8000,
15517 0xffff8000};
15518
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015519 uint64_t expected_fcvtzu_fp162w[] =
15520 {1, 1, 1, 0, 0x8000, 0, 0xffe0, 0, 0xffffffff, 0, 0x8000, 0};
TatWai Chongdb7437c2020-01-09 17:44:10 -080015521
15522 // Float16 to 32-bit integers.
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015523 TestFcvtzHelper(config,
15524 &MacroAssembler::Fcvtzs,
15525 kSRegSize,
15526 kHRegSize,
15527 zn_inputs,
15528 pg_inputs,
15529 expected_fcvtzs_fp162w);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015530
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015531 TestFcvtzHelper(config,
15532 &MacroAssembler::Fcvtzu,
15533 kSRegSize,
15534 kHRegSize,
15535 zn_inputs,
15536 pg_inputs,
15537 expected_fcvtzu_fp162w);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015538
15539 uint64_t expected_fcvtzs_fp162x[] = {1,
15540 1,
15541 1,
15542 0xffffffffffffffff,
15543 0x8000,
15544 0xffffffffffff8000,
15545 0xffe0,
15546 0xffffffffffff0020,
15547 0x7fffffffffffffff,
15548 0x8000000000000000,
15549 0x8000,
15550 0xffffffffffff8000};
15551
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015552 uint64_t expected_fcvtzu_fp162x[] =
15553 {1, 1, 1, 0, 0x8000, 0, 0xffe0, 0, 0xffffffffffffffff, 0, 0x8000, 0};
TatWai Chongdb7437c2020-01-09 17:44:10 -080015554
15555 // Float16 to 64-bit integers.
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015556 TestFcvtzHelper(config,
15557 &MacroAssembler::Fcvtzs,
15558 kDRegSize,
15559 kHRegSize,
15560 zn_inputs,
15561 pg_inputs,
15562 expected_fcvtzs_fp162x);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015563
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015564 TestFcvtzHelper(config,
15565 &MacroAssembler::Fcvtzu,
15566 kDRegSize,
15567 kHRegSize,
15568 zn_inputs,
15569 pg_inputs,
15570 expected_fcvtzu_fp162x);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015571}
15572
15573TEST_SVE(fcvtzs_fcvtzu_float) {
15574 const double w_max_float = 0x7fffff80; // Largest float < INT32_MAX.
15575 const double w_min_float = -w_max_float; // Smallest float > INT32_MIN.
15576 const double x_max_float = 0x7fffff8000000000; // Largest float < INT64_MAX.
15577 const double x_min_float = -x_max_float; // Smallest float > INT64_MIN.
15578 const double w_max_int_sub_one = kWMaxInt - 1;
15579 const double w_min_int_add_one = kWMinInt + 1;
15580 const double x_max_int_sub_one = kXMaxInt - 1;
15581 const double x_min_int_add_one = kXMinInt + 1;
15582
TatWai Chongdb7437c2020-01-09 17:44:10 -080015583 double zn_inputs[] = {1.0,
15584 1.1,
15585 1.5,
15586 -1.5,
15587 w_max_float,
15588 w_min_float,
15589 x_max_float,
15590 x_min_float,
15591 kFP64PositiveInfinity,
15592 kFP64NegativeInfinity,
15593 w_max_int_sub_one,
15594 w_min_int_add_one,
15595 x_max_int_sub_one,
15596 x_min_int_add_one};
15597
15598 int pg_inputs[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0};
15599
15600 uint64_t expected_fcvtzs_s2w[] = {1,
15601 1,
15602 1,
15603 0xffffffff,
15604 0x7fffff80,
15605 0x80000080,
15606 0x7fffffff,
15607 0x80000000,
15608 0x7fffffff,
15609 0x80000000,
15610 0x7fffffff,
15611 0x80000000,
15612 0x7fffffff,
15613 0x80000000};
15614
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015615 uint64_t expected_fcvtzu_s2w[] = {1,
15616 1,
15617 1,
15618 0,
15619 0x7fffff80,
15620 0,
15621 0xffffffff,
15622 0,
15623 0xffffffff,
15624 0,
15625 0x80000000,
15626 0,
15627 0xffffffff,
15628 0};
TatWai Chongdb7437c2020-01-09 17:44:10 -080015629
15630 // Float to 32-bit integers.
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015631 TestFcvtzHelper(config,
15632 &MacroAssembler::Fcvtzs,
15633 kSRegSize,
15634 kSRegSize,
15635 zn_inputs,
15636 pg_inputs,
15637 expected_fcvtzs_s2w);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015638
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015639 TestFcvtzHelper(config,
15640 &MacroAssembler::Fcvtzu,
15641 kSRegSize,
15642 kSRegSize,
15643 zn_inputs,
15644 pg_inputs,
15645 expected_fcvtzu_s2w);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015646
15647 uint64_t expected_fcvtzs_s2x[] = {1,
15648 1,
15649 1,
15650 0xffffffffffffffff,
15651 0x7fffff80,
15652 0xffffffff80000080,
15653 0x7fffff8000000000,
15654 0x8000008000000000,
15655 0x7fffffffffffffff,
15656 0x8000000000000000,
15657 0x80000000,
15658 0xffffffff80000000,
15659 0x7fffffffffffffff,
15660 0x8000000000000000};
15661
15662 uint64_t expected_fcvtzu_s2x[] = {1,
15663 1,
15664 1,
15665 0,
15666 0x7fffff80,
15667 0,
15668 0x7fffff8000000000,
15669 0,
15670 0xffffffffffffffff,
15671 0,
15672 0x0000000080000000,
15673 0,
15674 0x8000000000000000,
15675 0};
15676
15677 // Float to 64-bit integers.
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015678 TestFcvtzHelper(config,
15679 &MacroAssembler::Fcvtzs,
15680 kDRegSize,
15681 kSRegSize,
15682 zn_inputs,
15683 pg_inputs,
15684 expected_fcvtzs_s2x);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015685
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015686 TestFcvtzHelper(config,
15687 &MacroAssembler::Fcvtzu,
15688 kDRegSize,
15689 kSRegSize,
15690 zn_inputs,
15691 pg_inputs,
15692 expected_fcvtzu_s2x);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015693}
15694
15695TEST_SVE(fcvtzs_fcvtzu_double) {
TatWai Chongdb7437c2020-01-09 17:44:10 -080015696 const double w_max_float = 0x7fffff80; // Largest float < INT32_MAX.
15697 const double w_min_float = -w_max_float; // Smallest float > INT32_MIN.
15698 const double x_max_float = 0x7fffff8000000000; // Largest float < INT64_MAX.
15699 const double x_min_float = -x_max_float; // Smallest float > INT64_MIN.
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015700 const double w_max_double = kWMaxInt; // Largest double == INT32_MAX.
15701 const double w_min_double = -w_max_double; // Smallest double > INT32_MIN.
15702 const double x_max_double =
15703 0x7ffffffffffffc00; // Largest double < INT64_MAX.
15704 const double x_min_double = -x_max_double; // Smallest double > INT64_MIN.
TatWai Chongdb7437c2020-01-09 17:44:10 -080015705 const double w_max_int_sub_one = kWMaxInt - 1;
15706 const double w_min_int_add_one = kWMinInt + 1;
15707 const double x_max_int_sub_one = kXMaxInt - 1;
15708 const double x_min_int_add_one = kXMinInt + 1;
15709
15710 double zn_inputs[] = {1.0,
15711 1.1,
15712 1.5,
15713 -1.5,
15714 w_max_float,
15715 w_min_float,
15716 x_max_float,
15717 x_min_float,
15718 w_max_double,
15719 w_min_double,
15720 x_max_double,
15721 x_min_double,
15722 kFP64PositiveInfinity,
15723 kFP64NegativeInfinity,
15724 w_max_int_sub_one,
15725 w_min_int_add_one,
15726 x_max_int_sub_one,
15727 x_min_int_add_one};
15728
15729 int pg_inputs[] = {1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0};
15730
15731 uint64_t expected_fcvtzs_d2w[] = {1,
15732 1,
15733 1,
15734 0xffffffffffffffff,
15735 0x7fffff80,
15736 0xffffffff80000080,
15737 0x7fffffff,
15738 0xffffffff80000000,
15739 0x7fffffff,
15740 0xffffffff80000001,
15741 0x7fffffff,
15742 0xffffffff80000000,
15743 0x7fffffff,
15744 0xffffffff80000000,
15745 0x7ffffffe,
15746 0xffffffff80000001,
15747 0x7fffffff,
15748 0xffffffff80000000};
15749
15750 uint64_t expected_fcvtzu_d2w[] = {1,
15751 1,
15752 1,
15753 0,
15754 0x7fffff80,
15755 0,
15756 0xffffffff,
15757 0,
15758 0x7fffffff,
15759 0,
15760 0xffffffff,
15761 0,
15762 0xffffffff,
15763 0,
15764 0x7ffffffe,
15765 0,
15766 0xffffffff,
15767 0};
15768
15769 // Double to 32-bit integers.
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015770 TestFcvtzHelper(config,
15771 &MacroAssembler::Fcvtzs,
15772 kSRegSize,
15773 kDRegSize,
15774 zn_inputs,
15775 pg_inputs,
15776 expected_fcvtzs_d2w);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015777
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015778 TestFcvtzHelper(config,
15779 &MacroAssembler::Fcvtzu,
15780 kSRegSize,
15781 kDRegSize,
15782 zn_inputs,
15783 pg_inputs,
15784 expected_fcvtzu_d2w);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015785
15786 uint64_t expected_fcvtzs_d2x[] = {1,
15787 1,
15788 1,
15789 0xffffffffffffffff,
15790 0x7fffff80,
15791 0xffffffff80000080,
15792 0x7fffff8000000000,
15793 0x8000008000000000,
15794 0x7fffffff,
15795 0xffffffff80000001,
15796 0x7ffffffffffffc00,
15797 0x8000000000000400,
15798 0x7fffffffffffffff,
15799 0x8000000000000000,
15800 0x7ffffffe,
15801 0xffffffff80000001,
15802 0x7fffffffffffffff,
15803 0x8000000000000000};
15804
15805 uint64_t expected_fcvtzu_d2x[] = {1,
15806 1,
15807 1,
15808 0,
15809 0x7fffff80,
15810 0,
15811 0x7fffff8000000000,
15812 0,
15813 0x7fffffff,
15814 0,
15815 0x7ffffffffffffc00,
15816 0,
15817 0xffffffffffffffff,
15818 0,
15819 0x000000007ffffffe,
15820 0,
15821 0x8000000000000000,
15822 0};
15823
15824 // Double to 64-bit integers.
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015825 TestFcvtzHelper(config,
15826 &MacroAssembler::Fcvtzs,
15827 kDRegSize,
15828 kDRegSize,
15829 zn_inputs,
15830 pg_inputs,
15831 expected_fcvtzs_d2x);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015832
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015833 TestFcvtzHelper(config,
15834 &MacroAssembler::Fcvtzu,
15835 kDRegSize,
15836 kDRegSize,
15837 zn_inputs,
15838 pg_inputs,
15839 expected_fcvtzu_d2x);
15840}
15841
15842template <typename F, size_t N>
15843static void TestFrintHelper(Test* config,
15844 FcvtFrintMFn macro_m,
15845 FcvtFrintZFn macro_z,
15846 int lane_size_in_bits,
15847 const F (&zn_inputs)[N],
15848 const int (&pg_inputs)[N],
15849 const F (&zd_expected)[N]) {
15850 uint64_t zd_expected_rawbits[N];
15851 FPToRawbitsWithSize(zd_expected, zd_expected_rawbits, lane_size_in_bits);
15852 TestFcvtFrintHelper(config,
15853 macro_m,
15854 macro_z,
15855 lane_size_in_bits,
15856 lane_size_in_bits,
15857 zn_inputs,
15858 pg_inputs,
15859 zd_expected_rawbits);
15860}
15861
15862TEST_SVE(frint) {
15863 const double inf_pos = kFP64PositiveInfinity;
15864 const double inf_neg = kFP64NegativeInfinity;
15865
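  // Expected results for each rounding variant: 'a' rounds ties away from
  // zero, 'm' rounds towards minus infinity, 'n' rounds ties to even, 'p'
  // rounds towards plus infinity, 'z' rounds towards zero, and 'i'/'x' use the
  // current rounding mode (round-to-nearest-even here).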
15866 double zn_inputs[] =
15867 {1.1, 1.5, 1.9, 2.5, -1.5, -2.5, 0.0, -0.0, -0.2, inf_pos, inf_neg};
15868 double zd_expected_a[] =
15869 {1.0, 2.0, 2.0, 3.0, -2.0, -3.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
15870 double zd_expected_i[] =
15871 {1.0, 2.0, 2.0, 2.0, -2.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
15872 double zd_expected_m[] =
15873 {1.0, 1.0, 1.0, 2.0, -2.0, -3.0, 0.0, -0.0, -1.0, inf_pos, inf_neg};
15874 double zd_expected_n[] =
15875 {1.0, 2.0, 2.0, 2.0, -2.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
15876 double zd_expected_p[] =
15877 {2.0, 2.0, 2.0, 3.0, -1.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
15878 double zd_expected_x[] =
15879 {1.0, 2.0, 2.0, 2.0, -2.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
15880 double zd_expected_z[] =
15881 {1.0, 1.0, 1.0, 2.0, -1.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
15882
15883 int pg_inputs[] = {0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0};
15884
15885 struct TestDataSet {
15886 FcvtFrintMFn macro_m; // merging form.
15887 FcvtFrintZFn macro_z; // zeroing form.
15888 double (&expected)[11];
15889 };
15890
15891 TestDataSet test_data[] =
15892 {{&MacroAssembler::Frinta, &MacroAssembler::Frinta, zd_expected_a},
15893 {&MacroAssembler::Frinti, &MacroAssembler::Frinti, zd_expected_i},
15894 {&MacroAssembler::Frintm, &MacroAssembler::Frintm, zd_expected_m},
15895 {&MacroAssembler::Frintn, &MacroAssembler::Frintn, zd_expected_n},
15896 {&MacroAssembler::Frintp, &MacroAssembler::Frintp, zd_expected_p},
15897 {&MacroAssembler::Frintx, &MacroAssembler::Frintx, zd_expected_x},
15898 {&MacroAssembler::Frintz, &MacroAssembler::Frintz, zd_expected_z}};
15899
15900 unsigned lane_sizes[] = {kHRegSize, kSRegSize, kDRegSize};
15901
15902 for (size_t i = 0; i < sizeof(test_data) / sizeof(TestDataSet); i++) {
15903 for (size_t j = 0; j < ArrayLength(lane_sizes); j++) {
15904 TestFrintHelper(config,
15905 test_data[i].macro_m,
15906 test_data[i].macro_z,
15907 lane_sizes[j],
15908 zn_inputs,
15909 pg_inputs,
15910 test_data[i].expected);
15911 }
15912 }
TatWai Chongdb7437c2020-01-09 17:44:10 -080015913}
15914
TatWai Chong31cd6a02020-01-10 13:03:26 -080015915struct CvtfTestDataSet {
15916 uint64_t int_value;
15917 uint64_t scvtf_result;
15918 uint64_t ucvtf_result;
15919};
15920
15921template <size_t N>
15922static void TestUScvtfHelper(Test* config,
15923 int dst_type_size_in_bits,
15924 int src_type_size_in_bits,
15925 const int (&pg_inputs)[N],
15926 const CvtfTestDataSet (&data_set)[N]) {
15927 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
15928 START();
15929
15930 // Unpack the data from the array of struct into individual arrays that can
15931 // simplify the testing.
15932 uint64_t zn_inputs[N];
15933 uint64_t expected_zd_scvtf_all_active[N];
15934 uint64_t expected_zd_ucvtf_all_active[N];
15935 for (size_t i = 0; i < N; i++) {
15936 zn_inputs[i] = data_set[i].int_value;
15937 expected_zd_scvtf_all_active[i] = data_set[i].scvtf_result;
15938 expected_zd_ucvtf_all_active[i] = data_set[i].ucvtf_result;
15939 }
15940
15941 // If the input and result types have a different size, the instruction
15942 // operates on elements of the largest specified type.
15943 int lane_size_in_bits =
15944 std::max(dst_type_size_in_bits, src_type_size_in_bits);
15945
15946 ZRegister zd_scvtf_all_active = z25;
15947 ZRegister zd_ucvtf_all_active = z26;
15948 ZRegister zn = z27;
15949 InsrHelper(&masm, zn.WithLaneSize(lane_size_in_bits), zn_inputs);
15950
15951 PRegisterWithLaneSize pg_all_active = p0.WithLaneSize(lane_size_in_bits);
15952 __ Ptrue(pg_all_active);
15953
15954 // Test integer conversions with all lanes active.
15955 __ Scvtf(zd_scvtf_all_active.WithLaneSize(dst_type_size_in_bits),
15956 pg_all_active.Merging(),
15957 zn.WithLaneSize(src_type_size_in_bits));
15958 __ Ucvtf(zd_ucvtf_all_active.WithLaneSize(dst_type_size_in_bits),
15959 pg_all_active.Merging(),
15960 zn.WithLaneSize(src_type_size_in_bits));
15961
15962 ZRegister zd_scvtf_merged = z23;
15963 ZRegister zd_ucvtf_merged = z24;
15964
15965 PRegisterWithLaneSize pg_merged = p1.WithLaneSize(lane_size_in_bits);
15966 Initialise(&masm, pg_merged, pg_inputs);
15967
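  // Fill the merging destinations with a signalling NaN pattern so that any
  // write to an inactive lane is easy to spot.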
15968 uint64_t snan;
15969 switch (lane_size_in_bits) {
15970 case kHRegSize:
15971 snan = 0x7c11;
15972 break;
15973 case kSRegSize:
15974 snan = 0x7f951111;
15975 break;
15976 case kDRegSize:
15977 snan = 0x7ff5555511111111;
15978 break;
15979 }
15980 __ Dup(zd_scvtf_merged.WithLaneSize(lane_size_in_bits), snan);
15981 __ Dup(zd_ucvtf_merged.WithLaneSize(lane_size_in_bits), snan);
15982
15983 // Use the same `zn` inputs to test integer conversions, but with some lanes
15984 // set inactive.
15985 __ Scvtf(zd_scvtf_merged.WithLaneSize(dst_type_size_in_bits),
15986 pg_merged.Merging(),
15987 zn.WithLaneSize(src_type_size_in_bits));
15988 __ Ucvtf(zd_ucvtf_merged.WithLaneSize(dst_type_size_in_bits),
15989 pg_merged.Merging(),
15990 zn.WithLaneSize(src_type_size_in_bits));
15991
15992 END();
15993
15994 if (CAN_RUN()) {
15995 RUN();
15996
15997 ASSERT_EQUAL_SVE(expected_zd_scvtf_all_active,
15998 zd_scvtf_all_active.WithLaneSize(lane_size_in_bits));
15999 ASSERT_EQUAL_SVE(expected_zd_ucvtf_all_active,
16000 zd_ucvtf_all_active.WithLaneSize(lane_size_in_bits));
16001
16002 uint64_t expected_zd_scvtf_merged[N];
16003 for (size_t i = 0; i < N; i++) {
16004 expected_zd_scvtf_merged[i] =
16005 pg_inputs[i] ? expected_zd_scvtf_all_active[i] : snan;
16006 }
16007 ASSERT_EQUAL_SVE(expected_zd_scvtf_merged,
16008 zd_scvtf_merged.WithLaneSize(lane_size_in_bits));
16009
16010 uint64_t expected_zd_ucvtf_merged[N];
16011 for (size_t i = 0; i < N; i++) {
16012 expected_zd_ucvtf_merged[i] =
16013 pg_inputs[i] ? expected_zd_ucvtf_all_active[i] : snan;
16014 }
16015 ASSERT_EQUAL_SVE(expected_zd_ucvtf_merged,
16016 zd_ucvtf_merged.WithLaneSize(lane_size_in_bits));
16017 }
16018}
16019
16020TEST_SVE(scvtf_ucvtf_h_s_d_to_float16) {
16021 // clang-format off
16022 CvtfTestDataSet data_set_1[] = {
16023 // Simple conversions of positive numbers which require no rounding; the
16024 // results should not depend on the rounding mode, and ucvtf and scvtf should
16025 // produce the same result.
16026 {0x0000, 0x0000, 0x0000},
16027 {0x0001, 0x3c00, 0x3c00},
16028 {0x0010, 0x4c00, 0x4c00},
16029 {0x0080, 0x5800, 0x5800},
16030 {0x0400, 0x6400, 0x6400},
16031 // Conversions which require rounding.
16032 {0x4000, 0x7400, 0x7400},
16033 {0x4001, 0x7400, 0x7400},
16034 // Round up to produce a result that's too big for the input to represent.
16035 {0x7ff0, 0x77ff, 0x77ff},
16036 {0x7ff1, 0x77ff, 0x77ff},
16037 {0x7ffe, 0x7800, 0x7800},
16038 {0x7fff, 0x7800, 0x7800}};
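  // For example, 0x7ffe (32766) lies between the representable float16 values
  // 32752 (0x77ff) and 32768 (0x7800); round-to-nearest gives 0x7800, which is
  // larger than the input value.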
16039 int pg_1[] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
16040 TestUScvtfHelper(config, kHRegSize, kDRegSize, pg_1, data_set_1);
16041 TestUScvtfHelper(config, kHRegSize, kSRegSize, pg_1, data_set_1);
16042 TestUScvtfHelper(config, kHRegSize, kHRegSize, pg_1, data_set_1);
16043
16044 CvtfTestDataSet data_set_2[] = {
16045 // Test mantissa extremities.
16046 {0x0401, 0x6401, 0x6401},
16047 {0x4020, 0x7402, 0x7402},
16048 // The largest int16_t that fits in a float16.
16049 {0xffef, 0xcc40, 0x7bff},
16050 // Values that would be negative if treated as an int16_t.
16051 {0xff00, 0xdc00, 0x7bf8},
16052 {0x8000, 0xf800, 0x7800},
16053 {0x8100, 0xf7f0, 0x7808},
16054 // Check for bit pattern reproduction.
16055 {0x0123, 0x5c8c, 0x5c8c},
16056 {0x0cde, 0x6a6f, 0x6a6f},
16057 // Simple conversions of negative int16_t values. These require no rounding,
16058 // and the results should not depend on the rounding mode.
16059 {0xf800, 0xe800, 0x7bc0},
16060 {0xfc00, 0xe400, 0x7be0},
16061 {0xc000, 0xf400, 0x7a00},
16062 // Check rounding of negative int16_t values.
16063 {0x8ffe, 0xf700, 0x7880},
16064 {0x8fff, 0xf700, 0x7880},
16065 {0xffee, 0xcc80, 0x7bff},
16066 {0xffef, 0xcc40, 0x7bff}};
16067 int pg_2[] = {1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1};
16068 // The `32-bit to float16` and `64-bit to float16` forms of the above inputs
16069 // have effectively been covered by the `ucvtf` cases of `16-bit to float16`.
16070 TestUScvtfHelper(config, kHRegSize, kHRegSize, pg_2, data_set_2);
16071 // clang-format on
16072}
16073
16074TEST_SVE(scvtf_ucvtf_s_to_float) {
16075 // clang-format off
16076 int dst_lane_size = kSRegSize;
16077 int src_lane_size = kSRegSize;
16078
16079 // Simple conversions of positive numbers which require no rounding; the
16080 // results should not depend on the rounding mode, and ucvtf and scvtf should
16081 // produce the same result.
16082 CvtfTestDataSet data_set_1[] = {
16083 {0x00000000, 0x00000000, 0x00000000},
16084 {0x00000001, 0x3f800000, 0x3f800000},
16085 {0x00004000, 0x46800000, 0x46800000},
16086 {0x00010000, 0x47800000, 0x47800000},
16087 {0x40000000, 0x4e800000, 0x4e800000}};
16088 int pg_1[] = {1, 0, 1, 0, 0};
16089 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_1, data_set_1);
16090
16091 CvtfTestDataSet data_set_2[] = {
16092 // Test mantissa extremities.
16093 {0x00800001, 0x4b000001, 0x4b000001},
16094 {0x40400000, 0x4e808000, 0x4e808000},
16095 // The largest int32_t that fits in a float.
16096 {0x7fffff80, 0x4effffff, 0x4effffff},
16097 // Values that would be negative if treated as an int32_t.
16098 {0xffffffff, 0xbf800000, 0x4f800000},
16099 {0xffffff00, 0xc3800000, 0x4f7fffff},
16100 {0x80000000, 0xcf000000, 0x4f000000},
16101 {0x80000001, 0xcf000000, 0x4f000000},
16102 // Check for bit pattern reproduction.
16103 {0x089abcde, 0x4d09abce, 0x4d09abce},
16104 {0x12345678, 0x4d91a2b4, 0x4d91a2b4}};
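  // For example, 0xffffffff is -1 when treated as an int32_t, so scvtf gives
  // -1.0f (0xbf800000), while ucvtf treats it as 4294967295, which rounds up
  // to 2^32 (0x4f800000).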
16105 int pg_2[] = {1, 0, 1, 0, 1, 1, 1, 0, 0};
16106 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_2, data_set_2);
16107
16108 // Simple conversions of negative int32_t values. These require no rounding,
16109 // and the results should not depend on the rounding mode.
16110 CvtfTestDataSet data_set_3[] = {
16111 {0xffffc000, 0xc6800000, 0x4f7fffc0},
16112 {0xffff0000, 0xc7800000, 0x4f7fff00},
16113 {0xc0000000, 0xce800000, 0x4f400000},
16114 // Conversions which require rounding.
16115 {0x72800000, 0x4ee50000, 0x4ee50000},
16116 {0x72800001, 0x4ee50000, 0x4ee50000},
16117 {0x73000000, 0x4ee60000, 0x4ee60000},
16118 // Check rounding of negative int32_t values.
16119 {0x80000140, 0xcefffffe, 0x4f000001},
16120 {0x80000141, 0xcefffffd, 0x4f000001},
16121 {0x80000180, 0xcefffffd, 0x4f000002},
16122 // Round up to produce a result that's too big for the input to represent.
16123 {0x7fffffc0, 0x4f000000, 0x4f000000},
16124 {0x7fffffff, 0x4f000000, 0x4f000000}};
16125 int pg_3[] = {1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0};
16126 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_3, data_set_3);
16127 // clang-format on
16128}
16129
16130TEST_SVE(scvtf_ucvtf_d_to_float) {
16131 // clang-format off
16132 int dst_lane_size = kSRegSize;
16133 int src_lane_size = kDRegSize;
16134
16135 // Simple conversions of positive numbers which require no rounding; the
16136 // results should not depend on the rounding mode, and ucvtf and scvtf should
16137 // produce the same result.
16138 CvtfTestDataSet data_set_1[] = {
16139 {0x0000000000000000, 0x00000000, 0x00000000},
16140 {0x0000000000000001, 0x3f800000, 0x3f800000},
16141 {0x0000000040000000, 0x4e800000, 0x4e800000},
16142 {0x0000000100000000, 0x4f800000, 0x4f800000},
16143 {0x4000000000000000, 0x5e800000, 0x5e800000}};
16144 int pg_1[] = {1, 1, 0, 1, 0};
16145 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_1, data_set_1);
16146
16147 CvtfTestDataSet data_set_2[] = {
16148 // Test mantissa extremities.
16149 {0x0010000000000001, 0x59800000, 0x59800000},
16150 {0x4008000000000000, 0x5e801000, 0x5e801000},
16151 // The largest int32_t that fits in a float.
16152 {0x000000007fffff80, 0x4effffff, 0x4effffff},
16153 // Values that would be negative if treated as an int32_t.
16154 {0x00000000ffffffff, 0x4f800000, 0x4f800000},
16155 {0x00000000ffffff00, 0x4f7fffff, 0x4f7fffff},
16156 {0x0000000080000000, 0x4f000000, 0x4f000000},
16157 {0x0000000080000100, 0x4f000001, 0x4f000001},
16158 // The largest int64_t that fits in a float.
16159 {0x7fffff8000000000, 0x5effffff, 0x5effffff},
16160 // Check for bit pattern reproduction.
16161 {0x0123456789abcde0, 0x5b91a2b4, 0x5b91a2b4},
16162 {0x0000000000876543, 0x4b076543, 0x4b076543}};
16163 int pg_2[] = {1, 0, 0, 0, 1, 0, 0, 0, 0, 1};
16164 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_2, data_set_2);
16165
16166 CvtfTestDataSet data_set_3[] = {
16167 // Simple conversions of negative int64_t values. These require no rounding,
16168 // and the results should not depend on the rounding mode.
16169 {0xffffffffc0000000, 0xce800000, 0x5f800000},
16170 {0xffffffff00000000, 0xcf800000, 0x5f800000},
16171 {0xc000000000000000, 0xde800000, 0x5f400000},
16172 // Conversions which require rounding.
16173 {0x0000800002800000, 0x57000002, 0x57000002},
16174 {0x0000800002800001, 0x57000003, 0x57000003},
16175 {0x0000800003000000, 0x57000003, 0x57000003},
16176 // Check rounding of negative int64_t values.
16177 {0x8000014000000000, 0xdefffffe, 0x5f000001},
16178 {0x8000014000000001, 0xdefffffd, 0x5f000001},
16179 {0x8000018000000000, 0xdefffffd, 0x5f000002},
16180 // Round up to produce a result that's too big for the input to represent.
16181 {0x00000000ffffff80, 0x4f800000, 0x4f800000},
16182 {0x00000000ffffffff, 0x4f800000, 0x4f800000},
16183 {0xffffff8000000000, 0xd3000000, 0x5f800000},
16184 {0xffffffffffffffff, 0xbf800000, 0x5f800000}};
16185 int pg_3[] = {0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1};
16186 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_3, data_set_3);
16187 // clang-format on
16188}
16189
16190TEST_SVE(scvtf_ucvtf_d_to_double) {
16191 // clang-format off
16192 int dst_lane_size = kDRegSize;
16193 int src_lane_size = kDRegSize;
16194
16195 // Simple conversions of positive numbers which require no rounding; the
16196 // results should not depend on the rounding mode, and ucvtf and scvtf should
16197 // produce the same result.
16198 CvtfTestDataSet data_set_1[] = {
16199 {0x0000000000000000, 0x0000000000000000, 0x0000000000000000},
16200 {0x0000000000000001, 0x3ff0000000000000, 0x3ff0000000000000},
16201 {0x0000000040000000, 0x41d0000000000000, 0x41d0000000000000},
16202 {0x0000000100000000, 0x41f0000000000000, 0x41f0000000000000},
16203 {0x4000000000000000, 0x43d0000000000000, 0x43d0000000000000}};
16204 int pg_1[] = {0, 1, 1, 0, 0};
16205 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_1, data_set_1);
16206
16207 CvtfTestDataSet data_set_2[] = {
16208 // Test mantissa extremities.
16209 {0x0010000000000001, 0x4330000000000001, 0x4330000000000001},
16210 {0x4008000000000000, 0x43d0020000000000, 0x43d0020000000000},
16211 // The largest int32_t that fits in a double.
16212 {0x000000007fffffff, 0x41dfffffffc00000, 0x41dfffffffc00000},
16213 // Values that would be negative if treated as an int32_t.
16214 {0x00000000ffffffff, 0x41efffffffe00000, 0x41efffffffe00000},
16215 {0x0000000080000000, 0x41e0000000000000, 0x41e0000000000000},
16216 {0x0000000080000001, 0x41e0000000200000, 0x41e0000000200000},
16217 // The largest int64_t that fits in a double.
16218 {0x7ffffffffffffc00, 0x43dfffffffffffff, 0x43dfffffffffffff},
16219 // Check for bit pattern reproduction.
16220 {0x0123456789abcde0, 0x43723456789abcde, 0x43723456789abcde},
16221 {0x0000000012345678, 0x41b2345678000000, 0x41b2345678000000}};
16222 int pg_2[] = {1, 1, 1, 1, 1, 0, 0, 0, 0};
16223 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_2, data_set_2);
16224
16225 CvtfTestDataSet data_set_3[] = {
16226 // Simple conversions of negative int64_t values. These require no rounding,
16227 // and the results should not depend on the rounding mode.
16228 {0xffffffffc0000000, 0xc1d0000000000000, 0x43effffffff80000},
16229 {0xffffffff00000000, 0xc1f0000000000000, 0x43efffffffe00000},
16230 {0xc000000000000000, 0xc3d0000000000000, 0x43e8000000000000},
16231 // Conversions which require rounding.
16232 {0x1000000000000280, 0x43b0000000000002, 0x43b0000000000002},
16233 {0x1000000000000281, 0x43b0000000000003, 0x43b0000000000003},
16234 {0x1000000000000300, 0x43b0000000000003, 0x43b0000000000003},
16235 // Check rounding of negative int64_t values.
16236 {0x8000000000000a00, 0xc3dffffffffffffe, 0x43e0000000000001},
16237 {0x8000000000000a01, 0xc3dffffffffffffd, 0x43e0000000000001},
16238 {0x8000000000000c00, 0xc3dffffffffffffd, 0x43e0000000000002},
16239 // Round up to produce a result that's too big for the input to represent.
16240 {0x7ffffffffffffe00, 0x43e0000000000000, 0x43e0000000000000},
16241 {0x7fffffffffffffff, 0x43e0000000000000, 0x43e0000000000000},
16242 {0xfffffffffffffc00, 0xc090000000000000, 0x43f0000000000000},
16243 {0xffffffffffffffff, 0xbff0000000000000, 0x43f0000000000000}};
16244 int pg_3[] = {1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0};
16245 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_3, data_set_3);
16246 // clang-format on
16247}
16248
16249TEST_SVE(scvtf_ucvtf_s_to_double) {
16250 // clang-format off
16251 int dst_lane_size = kDRegSize;
16252 int src_lane_size = kSRegSize;
16253
16254 // Simple conversions of positive numbers which require no rounding; the
16255 // results should not depend on the rounding mode, and ucvtf and scvtf should
16256 // produce the same result.
16257 CvtfTestDataSet data_set_1[] = {
16258 {0x00000000, 0x0000000000000000, 0x0000000000000000},
16259 {0x00000001, 0x3ff0000000000000, 0x3ff0000000000000},
16260 {0x00004000, 0x40d0000000000000, 0x40d0000000000000},
16261 {0x00010000, 0x40f0000000000000, 0x40f0000000000000},
16262 {0x40000000, 0x41d0000000000000, 0x41d0000000000000}};
16263 int pg_1[] = {1, 0, 0, 0, 1};
16264 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_1, data_set_1);
16265
16266 CvtfTestDataSet data_set_2[] = {
16267 // Test mantissa extremities.
16268 {0x40000400, 0x41d0000100000000, 0x41d0000100000000},
16269 // The largest int32_t that fits in a double.
16270 {0x7fffffff, 0x41dfffffffc00000, 0x41dfffffffc00000},
16271 // Values that would be negative if treated as an int32_t.
16272 {0xffffffff, 0xbff0000000000000, 0x41efffffffe00000},
16273 {0x80000000, 0xc1e0000000000000, 0x41e0000000000000},
16274 {0x80000001, 0xc1dfffffffc00000, 0x41e0000000200000},
16275 // Check for bit pattern reproduction.
16276 {0x089abcde, 0x41a13579bc000000, 0x41a13579bc000000},
16277 {0x12345678, 0x41b2345678000000, 0x41b2345678000000},
16278 // Simple conversions of negative int32_t values. These require no rounding,
16279 // and the results should not depend on the rounding mode.
16280 {0xffffc000, 0xc0d0000000000000, 0x41effff800000000},
16281 {0xffff0000, 0xc0f0000000000000, 0x41efffe000000000},
16282 {0xc0000000, 0xc1d0000000000000, 0x41e8000000000000}};
16283 int pg_2[] = {1, 0, 1, 0, 0, 1, 1, 0, 1, 1};
16284 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_2, data_set_2);
16285
16286 // Note that the IEEE 754 double-precision format has a 52-bit fraction, so
16287 // all 32-bit integers are exactly representable in a double.
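  // In particular, 0xffffffff converts exactly: 2^32 - 1 has only 32
  // significant bits, which fit in the 52-bit fraction, giving the
  // 0x41efffffffe00000 result checked above.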
16288 // clang-format on
16289}
16290
Martyn Capewell4a9829f2020-01-30 17:41:01 +000016291TEST_SVE(sve_fadda) {
16292 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
16293 CPUFeatures::kFP,
16294 CPUFeatures::kFPHalf);
16295 START();
16296
16297 __ Ptrue(p0.VnB());
16298 __ Pfalse(p1.VnB());
16299 __ Zip1(p1.VnH(), p0.VnH(), p1.VnH());
16300
16301 __ Index(z0.VnS(), 3, 3);
16302 __ Scvtf(z0.VnS(), p0.Merging(), z0.VnS());
16303 __ Fmov(s2, 2.0);
16304 __ Fadda(s2, p0, s2, z0.VnS());
16305
16306 __ Index(z0.VnD(), -7, -7);
16307 __ Scvtf(z0.VnD(), p0.Merging(), z0.VnD());
16308 __ Fmov(d3, 3.0);
16309 __ Fadda(d3, p0, d3, z0.VnD());
16310
16311 __ Index(z0.VnH(), 1, 1);
16312 __ Scvtf(z0.VnH(), p0.Merging(), z0.VnH());
16313 __ Fmov(h4, 0);
16314 __ Fadda(h4, p1, h4, z0.VnH());
16315 END();
16316
16317 if (CAN_RUN()) {
16318 RUN();
16319 // Sum of 1 .. n is (n + 1) * (n / 2) for even n, i.e. n(n+1)/2.
16320 int n = core.GetSVELaneCount(kSRegSize);
16321 ASSERT_EQUAL_FP32(2 + 3 * ((n + 1) * (n / 2)), s2);
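    // For example, with a 128-bit VL the four S lanes hold 3.0, 6.0, 9.0 and
    // 12.0, so s2 = 2.0 + 30.0 = 32.0.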
16322
16323 n /= 2; // Half as many lanes.
16324 ASSERT_EQUAL_FP64(3 + -7 * ((n + 1) * (n / 2)), d3);
16325
16326 // Sum of first n odd numbers is n^2.
16327 n = core.GetSVELaneCount(kHRegSize) / 2; // Half are odd numbers.
16328 ASSERT_EQUAL_FP16(Float16(n * n), h4);
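    // For example, with a 128-bit VL there are eight H lanes, of which the
    // four active ones hold 1.0, 3.0, 5.0 and 7.0, summing to 16.0 (= 4 * 4).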
16329 }
16330}
16331
Martyn Capewellac07af12019-12-02 14:55:05 +000016332TEST_SVE(sve_extract) {
16333 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
16334 START();
16335
16336 __ Index(z0.VnB(), 0, 1);
16337
16338 __ Mov(z1, z0);
16339 __ Mov(z2, z0);
16340 __ Mov(z3, z0);
16341 __ Mov(z4, z0);
16342 __ Mov(z5, z0);
16343 __ Mov(z6, z0);
16344
16345 __ Ext(z1, z1, z0, 0);
16346 __ Ext(z2, z2, z0, 1);
16347 __ Ext(z3, z3, z0, 15);
16348 __ Ext(z4, z4, z0, 31);
16349 __ Ext(z5, z5, z0, 47);
16350 __ Ext(z6, z6, z0, 255);
16351
16352 END();
16353
16354 if (CAN_RUN()) {
16355 RUN();
16356
16357 ASSERT_EQUAL_SVE(z1, z0);
16358
16359 int lane_count = core.GetSVELaneCount(kBRegSize);
16360 if (lane_count == 16) {
16361 uint64_t z2_expected[] = {0x000f0e0d0c0b0a09, 0x0807060504030201};
16362 ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
16363 } else {
16364 uint64_t z2_expected[] = {0x100f0e0d0c0b0a09, 0x0807060504030201};
16365 ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
16366 }
16367
16368 if (lane_count == 16) {
16369 uint64_t z3_expected[] = {0x0e0d0c0b0a090807, 0x060504030201000f};
16370 ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
16371 } else {
16372 uint64_t z3_expected[] = {0x1e1d1c1b1a191817, 0x161514131211100f};
16373 ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
16374 }
16375
16376 if (lane_count < 32) {
16377 ASSERT_EQUAL_SVE(z4, z0);
16378 } else if (lane_count == 32) {
16379 uint64_t z4_expected[] = {0x0e0d0c0b0a090807, 0x060504030201001f};
16380 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
16381 } else {
16382 uint64_t z4_expected[] = {0x2e2d2c2b2a292827, 0x262524232221201f};
16383 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
16384 }
16385
16386 if (lane_count < 48) {
16387 ASSERT_EQUAL_SVE(z5, z0);
16388 } else if (lane_count == 48) {
16389 uint64_t z5_expected[] = {0x0e0d0c0b0a090807, 0x060504030201002f};
16390 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
16391 } else {
16392 uint64_t z5_expected[] = {0x3e3d3c3b3a393837, 0x363534333231302f};
16393 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
16394 }
16395
16396 if (lane_count < 256) {
16397 ASSERT_EQUAL_SVE(z6, z0);
16398 } else {
16399 uint64_t z6_expected[] = {0x0e0d0c0b0a090807, 0x06050403020100ff};
16400 ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
16401 }
16402 }
16403}
16404
Martyn Capewell894962f2020-02-05 15:46:44 +000016405TEST_SVE(sve_fp_paired_across) {
16406 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
16407
16408 START();
16409
16410 __ Ptrue(p0.VnB());
16411 __ Pfalse(p1.VnB());
16412 __ Zip1(p2.VnS(), p0.VnS(), p1.VnS());
16413 __ Zip1(p3.VnD(), p0.VnD(), p1.VnD());
16414 __ Zip1(p4.VnH(), p0.VnH(), p1.VnH());
16415
16416 __ Index(z0.VnS(), 3, 3);
16417 __ Scvtf(z0.VnS(), p0.Merging(), z0.VnS());
16418 __ Faddv(s1, p0, z0.VnS());
16419 __ Fminv(s2, p2, z0.VnS());
16420 __ Fmaxv(s3, p2, z0.VnS());
16421
16422 __ Index(z0.VnD(), -7, -7);
16423 __ Scvtf(z0.VnD(), p0.Merging(), z0.VnD());
16424 __ Faddv(d4, p0, z0.VnD());
16425 __ Fminv(d5, p3, z0.VnD());
16426 __ Fmaxv(d6, p3, z0.VnD());
16427
16428 __ Index(z0.VnH(), 1, 1);
16429 __ Scvtf(z0.VnH(), p0.Merging(), z0.VnH());
16430 __ Faddv(h7, p4, z0.VnH());
16431 __ Fminv(h8, p4, z0.VnH());
16432 __ Fmaxv(h9, p4, z0.VnH());
16433
16434 __ Dup(z10.VnH(), 0);
16435 __ Fdiv(z10.VnH(), p0.Merging(), z10.VnH(), z10.VnH());
16436 __ Insr(z10.VnH(), 0x5140);
16437 __ Insr(z10.VnH(), 0xd140);
16438 __ Ext(z10.VnB(), z10.VnB(), z10.VnB(), 2);
16439 __ Fmaxnmv(h11, p0, z10.VnH());
16440 __ Fmaxnmv(h12, p4, z10.VnH());
16441 __ Fminnmv(h13, p0, z10.VnH());
16442 __ Fminnmv(h14, p4, z10.VnH());
16443
16444 __ Dup(z10.VnS(), 0);
16445 __ Fdiv(z10.VnS(), p0.Merging(), z10.VnS(), z10.VnS());
16446 __ Insr(z10.VnS(), 0x42280000);
16447 __ Insr(z10.VnS(), 0xc2280000);
16448 __ Ext(z10.VnB(), z10.VnB(), z10.VnB(), 4);
16449 __ Fmaxnmv(s15, p0, z10.VnS());
16450 __ Fmaxnmv(s16, p2, z10.VnS());
16451 __ Fminnmv(s17, p0, z10.VnS());
16452 __ Fminnmv(s18, p2, z10.VnS());
16453
16454 __ Dup(z10.VnD(), 0);
16455 __ Fdiv(z10.VnD(), p0.Merging(), z10.VnD(), z10.VnD());
16456 __ Insr(z10.VnD(), 0x4045000000000000);
16457 __ Insr(z10.VnD(), 0xc045000000000000);
16458 __ Ext(z10.VnB(), z10.VnB(), z10.VnB(), 8);
16459 __ Fmaxnmv(d19, p0, z10.VnD());
16460 __ Fmaxnmv(d20, p3, z10.VnD());
16461 __ Fminnmv(d21, p0, z10.VnD());
16462 __ Fminnmv(d22, p3, z10.VnD());
16463 END();
16464
16465 if (CAN_RUN()) {
16466 RUN();
16467 // Sum of 1 .. n is (n + 1) * (n / 2) for even n, i.e. n(n+1)/2.
16468 int n = core.GetSVELaneCount(kSRegSize);
16469 ASSERT_EQUAL_FP32(3 * ((n + 1) * (n / 2)), s1);
16470 ASSERT_EQUAL_FP32(3, s2);
16471 ASSERT_EQUAL_FP32(3 * n - 3, s3);
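    // For example, with a 128-bit VL (n == 4): s1 = 3 + 6 + 9 + 12 = 30, and
    // p2 selects lanes 0 and 2, so s2 = min(3, 9) = 3 and s3 = max(3, 9) = 9.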
16472
16473 n /= 2; // Half as many lanes.
16474 ASSERT_EQUAL_FP64(-7 * ((n + 1) * (n / 2)), d4);
16475 ASSERT_EQUAL_FP64(-7 * (n - 1), d5);
16476 ASSERT_EQUAL_FP64(-7, d6);
16477
16478 // Sum of first n odd numbers is n^2.
16479 n = core.GetSVELaneCount(kHRegSize) / 2; // Half are odd numbers.
16480 ASSERT_EQUAL_FP16(Float16(n * n), h7);
16481 ASSERT_EQUAL_FP16(Float16(1), h8);
16482
16483 n = core.GetSVELaneCount(kHRegSize);
16484 ASSERT_EQUAL_FP16(Float16(n - 1), h9);
16485
16486 ASSERT_EQUAL_FP16(Float16(42), h11);
16487 ASSERT_EQUAL_FP16(Float16(42), h12);
16488 ASSERT_EQUAL_FP16(Float16(-42), h13);
16489 ASSERT_EQUAL_FP16(Float16(42), h14);
16490 ASSERT_EQUAL_FP32(42, s15);
16491 ASSERT_EQUAL_FP32(42, s16);
16492 ASSERT_EQUAL_FP32(-42, s17);
16493 ASSERT_EQUAL_FP32(42, s18);
16494 ASSERT_EQUAL_FP64(42, d19);
16495 ASSERT_EQUAL_FP64(42, d20);
16496 ASSERT_EQUAL_FP64(-42, d21);
16497 ASSERT_EQUAL_FP64(42, d22);
16498 }
16499}
16500
Martyn Capewell13050ca2020-02-11 16:43:40 +000016501TEST_SVE(sve_frecpe_frsqrte) {
16502 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
16503
16504 START();
16505
16506 __ Ptrue(p0.VnB());
16507
16508 __ Index(z0.VnH(), 0, 1);
16509 __ Fdup(z1.VnH(), Float16(1));
16510 __ Fscale(z1.VnH(), p0.Merging(), z1.VnH(), z0.VnH());
16511 __ Insr(z1.VnH(), 0);
16512 __ Frsqrte(z2.VnH(), z1.VnH());
16513 __ Frecpe(z1.VnH(), z1.VnH());
16514
16515 __ Index(z0.VnS(), 0, 1);
16516 __ Fdup(z3.VnS(), Float16(1));
16517 __ Fscale(z3.VnS(), p0.Merging(), z3.VnS(), z0.VnS());
16518 __ Insr(z3.VnS(), 0);
16519 __ Frsqrte(z4.VnS(), z3.VnS());
16520 __ Frecpe(z3.VnS(), z3.VnS());
16521
16522 __ Index(z0.VnD(), 0, 1);
16523 __ Fdup(z5.VnD(), Float16(1));
16524 __ Fscale(z5.VnD(), p0.Merging(), z5.VnD(), z0.VnD());
16525 __ Insr(z5.VnD(), 0);
16526 __ Frsqrte(z6.VnD(), z5.VnD());
16527 __ Frecpe(z5.VnD(), z5.VnD());
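  // Lane 0 of each input is 0.0, for which both Frecpe and Frsqrte return
  // +infinity; the other lanes hold powers of two, and the expected values
  // below are the corresponding reciprocal (root) estimates.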
16528 END();
16529
16530 if (CAN_RUN()) {
16531 RUN();
16532 uint64_t z1_expected[] = {0x23fc27fc2bfc2ffc, 0x33fc37fc3bfc7c00};
16533 ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
16534 uint64_t z2_expected[] = {0x2ffc31a433fc35a4, 0x37fc39a43bfc7c00};
16535 ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
16536
16537 uint64_t z3_expected[] = {0x3e7f80003eff8000, 0x3f7f80007f800000};
16538 ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
16539 uint64_t z4_expected[] = {0x3eff80003f348000, 0x3f7f80007f800000};
16540 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
16541
16542 uint64_t z5_expected[] = {0x3feff00000000000, 0x7ff0000000000000};
16543 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
16544 uint64_t z6_expected[] = {0x3feff00000000000, 0x7ff0000000000000};
16545 ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
16546 }
16547}
16548
Martyn Capewellefd9dc72020-02-13 10:46:29 +000016549TEST_SVE(sve_frecps_frsqrts) {
16550 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
16551
16552 START();
16553 __ Ptrue(p0.VnB());
16554
16555 __ Index(z0.VnH(), 0, -1);
16556 __ Fdup(z1.VnH(), Float16(1));
16557 __ Fscale(z1.VnH(), p0.Merging(), z1.VnH(), z0.VnH());
16558 __ Scvtf(z0.VnH(), p0.Merging(), z0.VnH());
16559 __ Insr(z1.VnH(), 0);
16560 __ Frsqrts(z2.VnH(), z1.VnH(), z0.VnH());
16561 __ Frecps(z1.VnH(), z1.VnH(), z0.VnH());
16562
16563 __ Index(z0.VnS(), 0, -1);
16564 __ Fdup(z3.VnS(), Float16(1));
16565 __ Fscale(z3.VnS(), p0.Merging(), z3.VnS(), z0.VnS());
16566 __ Scvtf(z0.VnS(), p0.Merging(), z0.VnS());
16567 __ Insr(z3.VnS(), 0);
16568 __ Frsqrts(z4.VnS(), z3.VnS(), z0.VnS());
16569 __ Frecps(z3.VnS(), z3.VnS(), z0.VnS());
16570
16571 __ Index(z0.VnD(), 0, -1);
16572 __ Fdup(z5.VnD(), Float16(1));
16573 __ Fscale(z5.VnD(), p0.Merging(), z5.VnD(), z0.VnD());
16574 __ Scvtf(z0.VnD(), p0.Merging(), z0.VnD());
16575 __ Insr(z5.VnD(), 0);
16576 __ Frsqrts(z6.VnD(), z5.VnD(), z0.VnD());
16577 __ Frecps(z5.VnD(), z5.VnD(), z0.VnD());
16578 END();
16579
16580 if (CAN_RUN()) {
16581 RUN();
16582 uint64_t z1_expected[] = {0x4038406040a04100, 0x4180420042004000};
16583 ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
16584 uint64_t z2_expected[] = {0x3e383e603ea03f00, 0x3f80400040003e00};
16585 ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
16586
16587 uint64_t z3_expected[] = {0x4030000040400000, 0x4040000040000000};
16588 ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
16589 uint64_t z4_expected[] = {0x3ff0000040000000, 0x400000003fc00000};
16590 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
16591
16592 uint64_t z5_expected[] = {0x4008000000000000, 0x4000000000000000};
16593 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
16594 uint64_t z6_expected[] = {0x4000000000000000, 0x3ff8000000000000};
16595 ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
16596 }
16597}
16598
16599TEST_SVE(sve_ftsmul) {
16600 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
16601
16602 START();
16603 __ Ptrue(p0.VnB());
16604
16605 __ Index(z0.VnH(), 0, 1);
16606 __ Rev(z1.VnH(), z0.VnH());
16607 __ Scvtf(z0.VnH(), p0.Merging(), z0.VnH());
16608 __ Dup(z2.VnH(), 0);
16609 __ Fdiv(z2.VnH(), p0.Merging(), z2.VnH(), z2.VnH());
16610 __ Ftsmul(z3.VnH(), z0.VnH(), z1.VnH());
16611 __ Ftsmul(z4.VnH(), z2.VnH(), z1.VnH());
16612
16613 __ Index(z0.VnS(), -7, 1);
16614 __ Rev(z1.VnS(), z0.VnS());
16615 __ Scvtf(z0.VnS(), p0.Merging(), z0.VnS());
16616 __ Dup(z2.VnS(), 0);
16617 __ Fdiv(z2.VnS(), p0.Merging(), z2.VnS(), z2.VnS());
16618 __ Ftsmul(z5.VnS(), z0.VnS(), z1.VnS());
16619 __ Ftsmul(z6.VnS(), z2.VnS(), z1.VnS());
16620
16621 __ Index(z0.VnD(), 2, -1);
16622 __ Rev(z1.VnD(), z0.VnD());
16623 __ Scvtf(z0.VnD(), p0.Merging(), z0.VnD());
16624 __ Dup(z2.VnD(), 0);
16625 __ Fdiv(z2.VnD(), p0.Merging(), z2.VnD(), z2.VnD());
16626 __ Ftsmul(z7.VnD(), z0.VnD(), z1.VnD());
16627 __ Ftsmul(z8.VnD(), z2.VnD(), z1.VnD());
16628 END();
16629
16630 if (CAN_RUN()) {
16631 RUN();
16632 uint64_t z3_expected[] = {0x5220d0804e40cc00, 0x4880c4003c008000};
16633 ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
16634 uint64_t z4_expected[] = {0x7e007e007e007e00, 0x7e007e007e007e00};
16635 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
16636
Jacob Bramleydfb93b52020-07-02 12:06:45 +010016637 uint64_t z5_expected[] = {0xc180000041c80000, 0xc210000042440000};
Martyn Capewellefd9dc72020-02-13 10:46:29 +000016638 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
16639 uint64_t z6_expected[] = {0x7fc000007fc00000, 0x7fc000007fc00000};
16640 ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
16641
16642 uint64_t z7_expected[] = {0x3ff0000000000000, 0xc010000000000000};
16643 ASSERT_EQUAL_SVE(z7_expected, z7.VnD());
16644 uint64_t z8_expected[] = {0x7ff8000000000000, 0x7ff8000000000000};
16645 ASSERT_EQUAL_SVE(z8_expected, z8.VnD());
16646 }
16647}
TatWai Chongf8d29f12020-02-16 22:53:18 -080016648
16649typedef void (MacroAssembler::*FPMulAccFn)(
16650 const ZRegister& zd,
16651 const PRegisterM& pg,
16652 const ZRegister& za,
16653 const ZRegister& zn,
16654 const ZRegister& zm,
16655 FPMacroNaNPropagationOption nan_option);
16656
16657 // `pg_inputs` is used to check that predication is handled correctly within
16658 // the helper. It does not affect the `result` argument; `result` holds the
16659 // expected results under an all-true predicate.
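// For any lane where pg_inputs[i] == 0, the helper expects the destination to
// keep its original value (za, zn, zm or the initial signalling NaN, depending
// on which register the destination aliases), regardless of result[i].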
16660template <typename T, size_t N>
16661static void FPMulAccHelper(
16662 Test* config,
16663 FPMulAccFn macro,
16664 unsigned lane_size_in_bits,
16665 const int (&pg_inputs)[N],
16666 const T (&za_inputs)[N],
16667 const T (&zn_inputs)[N],
16668 const T (&zm_inputs)[N],
16669 const uint64_t (&result)[N],
16670 FPMacroNaNPropagationOption nan_option = FastNaNPropagation) {
16671 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
16672 START();
16673
16674 ZRegister zd = z0.WithLaneSize(lane_size_in_bits);
16675 ZRegister za = z1.WithLaneSize(lane_size_in_bits);
16676 ZRegister zn = z2.WithLaneSize(lane_size_in_bits);
16677 ZRegister zm = z3.WithLaneSize(lane_size_in_bits);
16678
16679 uint64_t za_rawbits[N];
16680 uint64_t zn_rawbits[N];
16681 uint64_t zm_rawbits[N];
16682
16683 FPToRawbitsWithSize(za_inputs, za_rawbits, lane_size_in_bits);
16684 FPToRawbitsWithSize(zn_inputs, zn_rawbits, lane_size_in_bits);
16685 FPToRawbitsWithSize(zm_inputs, zm_rawbits, lane_size_in_bits);
16686
16687 InsrHelper(&masm, za, za_rawbits);
16688 InsrHelper(&masm, zn, zn_rawbits);
16689 InsrHelper(&masm, zm, zm_rawbits);
16690
TatWai Chong2cb1b612020-03-04 23:51:21 -080016691 // Initialize `zd` with a signalling NaN.
16692 uint64_t sn = GetSignallingNan(lane_size_in_bits);
16693 __ Mov(x29, sn);
16694 __ Dup(zd, x29);
TatWai Chongf8d29f12020-02-16 22:53:18 -080016695
16696 Initialise(&masm, p0.WithLaneSize(lane_size_in_bits), pg_inputs);
16697
16698 // Fmla macro automatically selects between fmla, fmad and movprfx + fmla
16699 // Fmls `ditto` fmls, fmsb and movprfx + fmls
16700 // Fnmla `ditto` fnmla, fnmad and movprfx + fnmla
16701 // Fnmls `ditto` fnmls, fnmsb and movprfx + fnmls
16702 // based on what registers are aliased.
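  // For example, `da_result` below aliases `za`, so a single fmla/fmls/fnmla/
  // fnmls suffices, whereas `d_result` aliases none of the inputs and requires
  // a movprfx first.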
16703 ZRegister da_result = z10.WithLaneSize(lane_size_in_bits);
16704 ZRegister dn_result = z11.WithLaneSize(lane_size_in_bits);
16705 ZRegister dm_result = z12.WithLaneSize(lane_size_in_bits);
16706 ZRegister d_result = z13.WithLaneSize(lane_size_in_bits);
16707
16708 __ Mov(da_result, za);
16709 (masm.*macro)(da_result, p0.Merging(), da_result, zn, zm, nan_option);
16710
16711 __ Mov(dn_result, zn);
16712 (masm.*macro)(dn_result, p0.Merging(), za, dn_result, zm, nan_option);
16713
16714 __ Mov(dm_result, zm);
16715 (masm.*macro)(dm_result, p0.Merging(), za, zn, dm_result, nan_option);
16716
16717 __ Mov(d_result, zd);
16718 (masm.*macro)(d_result, p0.Merging(), za, zn, zm, nan_option);
16719
16720 END();
16721
16722 if (CAN_RUN()) {
16723 RUN();
16724
16725 ASSERT_EQUAL_SVE(za_rawbits, za);
16726 ASSERT_EQUAL_SVE(zn_rawbits, zn);
16727 ASSERT_EQUAL_SVE(zm_rawbits, zm);
16728
16729 uint64_t da_expected[N];
16730 uint64_t dn_expected[N];
16731 uint64_t dm_expected[N];
16732 uint64_t d_expected[N];
16733 for (size_t i = 0; i < N; i++) {
16734 da_expected[i] = ((pg_inputs[i] & 1) != 0) ? result[i] : za_rawbits[i];
16735 dn_expected[i] = ((pg_inputs[i] & 1) != 0) ? result[i] : zn_rawbits[i];
16736 dm_expected[i] = ((pg_inputs[i] & 1) != 0) ? result[i] : zm_rawbits[i];
TatWai Chong2cb1b612020-03-04 23:51:21 -080016737 d_expected[i] = ((pg_inputs[i] & 1) != 0) ? result[i] : sn;
TatWai Chongf8d29f12020-02-16 22:53:18 -080016738 }
16739
16740 ASSERT_EQUAL_SVE(da_expected, da_result);
16741 ASSERT_EQUAL_SVE(dn_expected, dn_result);
16742 ASSERT_EQUAL_SVE(dm_expected, dm_result);
16743 ASSERT_EQUAL_SVE(d_expected, d_result);
16744 }
16745}
16746
16747TEST_SVE(sve_fmla_fmad) {
16748 // fmla : zd = za + zn * zm
16749 double za_inputs[] = {-39.0, 1.0, -3.0, 2.0};
16750 double zn_inputs[] = {-5.0, -20.0, 9.0, 8.0};
16751 double zm_inputs[] = {9.0, -5.0, 4.0, 5.0};
16752 int pg_inputs[] = {1, 1, 0, 1};
16753
16754 uint64_t fmla_result_h[] = {Float16ToRawbits(Float16(-84.0)),
16755 Float16ToRawbits(Float16(101.0)),
16756 Float16ToRawbits(Float16(33.0)),
16757 Float16ToRawbits(Float16(42.0))};
16758
16759 // `fmad` has been tested in the helper.
16760 FPMulAccHelper(config,
16761 &MacroAssembler::Fmla,
16762 kHRegSize,
16763 pg_inputs,
16764 za_inputs,
16765 zn_inputs,
16766 zm_inputs,
16767 fmla_result_h);
16768
16769 uint64_t fmla_result_s[] = {FloatToRawbits(-84.0f),
16770 FloatToRawbits(101.0f),
16771 FloatToRawbits(33.0f),
16772 FloatToRawbits(42.0f)};
16773
16774 FPMulAccHelper(config,
16775 &MacroAssembler::Fmla,
16776 kSRegSize,
16777 pg_inputs,
16778 za_inputs,
16779 zn_inputs,
16780 zm_inputs,
16781 fmla_result_s);
16782
16783 uint64_t fmla_result_d[] = {DoubleToRawbits(-84.0),
16784 DoubleToRawbits(101.0),
16785 DoubleToRawbits(33.0),
16786 DoubleToRawbits(42.0)};
16787
16788 FPMulAccHelper(config,
16789 &MacroAssembler::Fmla,
16790 kDRegSize,
16791 pg_inputs,
16792 za_inputs,
16793 zn_inputs,
16794 zm_inputs,
16795 fmla_result_d);
16796}
16797
16798TEST_SVE(sve_fmls_fmsb) {
16799 // fmls : zd = za - zn * zm
16800 double za_inputs[] = {-39.0, 1.0, -3.0, 2.0};
16801 double zn_inputs[] = {-5.0, -20.0, 9.0, 8.0};
16802 double zm_inputs[] = {9.0, -5.0, 4.0, 5.0};
16803 int pg_inputs[] = {1, 0, 1, 1};
16804
16805 uint64_t fmls_result_h[] = {Float16ToRawbits(Float16(6.0)),
16806 Float16ToRawbits(Float16(-99.0)),
16807 Float16ToRawbits(Float16(-39.0)),
16808 Float16ToRawbits(Float16(-38.0))};
16809
16810 // `fmsb` has been tested in the helper.
16811 FPMulAccHelper(config,
16812 &MacroAssembler::Fmls,
16813 kHRegSize,
16814 pg_inputs,
16815 za_inputs,
16816 zn_inputs,
16817 zm_inputs,
16818 fmls_result_h);
16819
16820 uint64_t fmls_result_s[] = {FloatToRawbits(6.0f),
16821 FloatToRawbits(-99.0f),
16822 FloatToRawbits(-39.0f),
16823 FloatToRawbits(-38.0f)};
16824
16825 FPMulAccHelper(config,
16826 &MacroAssembler::Fmls,
16827 kSRegSize,
16828 pg_inputs,
16829 za_inputs,
16830 zn_inputs,
16831 zm_inputs,
16832 fmls_result_s);
16833
16834 uint64_t fmls_result_d[] = {DoubleToRawbits(6.0),
16835 DoubleToRawbits(-99.0),
16836 DoubleToRawbits(-39.0),
16837 DoubleToRawbits(-38.0)};
16838
16839 FPMulAccHelper(config,
16840 &MacroAssembler::Fmls,
16841 kDRegSize,
16842 pg_inputs,
16843 za_inputs,
16844 zn_inputs,
16845 zm_inputs,
16846 fmls_result_d);
16847}
16848
16849TEST_SVE(sve_fnmla_fnmad) {
16850 // fnmla : zd = -za - zn * zm
16851 double za_inputs[] = {-39.0, 1.0, -3.0, 2.0};
16852 double zn_inputs[] = {-5.0, -20.0, 9.0, 8.0};
16853 double zm_inputs[] = {9.0, -5.0, 4.0, 5.0};
16854 int pg_inputs[] = {0, 1, 1, 1};
16855
16856 uint64_t fnmla_result_h[] = {Float16ToRawbits(Float16(84.0)),
16857 Float16ToRawbits(Float16(-101.0)),
16858 Float16ToRawbits(Float16(-33.0)),
16859 Float16ToRawbits(Float16(-42.0))};
16860
16861 // `fnmad` has been tested in the helper.
16862 FPMulAccHelper(config,
16863 &MacroAssembler::Fnmla,
16864 kHRegSize,
16865 pg_inputs,
16866 za_inputs,
16867 zn_inputs,
16868 zm_inputs,
16869 fnmla_result_h);
16870
16871 uint64_t fnmla_result_s[] = {FloatToRawbits(84.0f),
16872 FloatToRawbits(-101.0f),
16873 FloatToRawbits(-33.0f),
16874 FloatToRawbits(-42.0f)};
16875
16876 FPMulAccHelper(config,
16877 &MacroAssembler::Fnmla,
16878 kSRegSize,
16879 pg_inputs,
16880 za_inputs,
16881 zn_inputs,
16882 zm_inputs,
16883 fnmla_result_s);
16884
16885 uint64_t fnmla_result_d[] = {DoubleToRawbits(84.0),
16886 DoubleToRawbits(-101.0),
16887 DoubleToRawbits(-33.0),
16888 DoubleToRawbits(-42.0)};
16889
16890 FPMulAccHelper(config,
16891 &MacroAssembler::Fnmla,
16892 kDRegSize,
16893 pg_inputs,
16894 za_inputs,
16895 zn_inputs,
16896 zm_inputs,
16897 fnmla_result_d);
16898}
16899
16900TEST_SVE(sve_fnmls_fnmsb) {
16901 // fnmls : zd = -za + zn * zm
16902 double za_inputs[] = {-39.0, 1.0, -3.0, 2.0};
16903 double zn_inputs[] = {-5.0, -20.0, 9.0, 8.0};
16904 double zm_inputs[] = {9.0, -5.0, 4.0, 5.0};
16905 int pg_inputs[] = {1, 1, 1, 0};
16906
16907 uint64_t fnmls_result_h[] = {Float16ToRawbits(Float16(-6.0)),
16908 Float16ToRawbits(Float16(99.0)),
16909 Float16ToRawbits(Float16(39.0)),
16910 Float16ToRawbits(Float16(38.0))};
16911
16912 // `fnmsb` has been tested in the helper.
16913 FPMulAccHelper(config,
16914 &MacroAssembler::Fnmls,
16915 kHRegSize,
16916 pg_inputs,
16917 za_inputs,
16918 zn_inputs,
16919 zm_inputs,
16920 fnmls_result_h);
16921
16922 uint64_t fnmls_result_s[] = {FloatToRawbits(-6.0f),
16923 FloatToRawbits(99.0f),
16924 FloatToRawbits(39.0f),
16925 FloatToRawbits(38.0f)};
16926
16927 FPMulAccHelper(config,
16928 &MacroAssembler::Fnmls,
16929 kSRegSize,
16930 pg_inputs,
16931 za_inputs,
16932 zn_inputs,
16933 zm_inputs,
16934 fnmls_result_s);
16935
16936 uint64_t fnmls_result_d[] = {DoubleToRawbits(-6.0),
16937 DoubleToRawbits(99.0),
16938 DoubleToRawbits(39.0),
16939 DoubleToRawbits(38.0)};
16940
16941 FPMulAccHelper(config,
16942 &MacroAssembler::Fnmls,
16943 kDRegSize,
16944 pg_inputs,
16945 za_inputs,
16946 zn_inputs,
16947 zm_inputs,
16948 fnmls_result_d);
16949}
16950
TatWai Chonga2c1bb72020-02-16 23:16:47 -080016951typedef void (MacroAssembler::*FPMulAccIdxFn)(const ZRegister& zd,
16952 const ZRegister& za,
16953 const ZRegister& zn,
16954 const ZRegister& zm,
16955 int index);
16956
16957template <typename T, size_t N>
16958static void FPMulAccIdxHelper(Test* config,
16959 FPMulAccFn macro,
16960 FPMulAccIdxFn macro_idx,
16961 const T (&za_inputs)[N],
16962 const T (&zn_inputs)[N],
16963 const T (&zm_inputs)[N]) {
16964 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
16965 START();
16966
Martyn Capewellc7501512020-03-16 10:35:33 +000016967 __ Ptrue(p0.VnB());
16968
16969 // Repeat the indexed vector (zm inputs) across up to a 2048-bit VL.
16970 for (size_t i = 0; i < (kZRegMaxSize / kDRegSize); i += N) {
16971 InsrHelper(&masm, z30.VnD(), zm_inputs);
16972 }
16973
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070016974 FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z30.VnH());
Martyn Capewellc7501512020-03-16 10:35:33 +000016975
TatWai Chonga2c1bb72020-02-16 23:16:47 -080016976 InsrHelper(&masm, z1.VnD(), zn_inputs);
16977 InsrHelper(&masm, z2.VnD(), za_inputs);
16978
16979 __ Mov(z3, z0);
16980 (masm.*macro_idx)(z3.VnH(), z2.VnH(), z1.VnH(), z3.VnH(), 0); // zd == zm
16981 __ Mov(z4, z1);
16982 (masm.*macro_idx)(z4.VnH(), z2.VnH(), z4.VnH(), z0.VnH(), 1); // zd == zn
16983 __ Mov(z5, z2);
16984 (masm.*macro_idx)(z5.VnH(), z5.VnH(), z1.VnH(), z0.VnH(), 4); // zd == za
16985 (masm.*macro_idx)(z6.VnH(), z2.VnH(), z1.VnH(), z0.VnH(), 7);
16986
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070016987 FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z30.VnS());
Martyn Capewellc7501512020-03-16 10:35:33 +000016988
TatWai Chonga2c1bb72020-02-16 23:16:47 -080016989 __ Mov(z7, z0);
16990 (masm.*macro_idx)(z7.VnS(), z2.VnS(), z1.VnS(), z7.VnS(), 0); // zd == zm
16991 __ Mov(z8, z1);
16992 (masm.*macro_idx)(z8.VnS(), z2.VnS(), z8.VnS(), z0.VnS(), 1); // zd == zn
16993 __ Mov(z9, z2);
16994 (masm.*macro_idx)(z9.VnS(), z9.VnS(), z1.VnS(), z0.VnS(), 2); // zd == za
16995 (masm.*macro_idx)(z10.VnS(), z2.VnS(), z1.VnS(), z0.VnS(), 3);
16996
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070016997 FPSegmentPatternHelper(&masm, z0.VnD(), p0.Merging(), z30.VnD());
Martyn Capewellc7501512020-03-16 10:35:33 +000016998
TatWai Chonga2c1bb72020-02-16 23:16:47 -080016999 __ Mov(z11, z0);
17000 (masm.*macro_idx)(z11.VnD(), z2.VnD(), z1.VnD(), z11.VnD(), 0); // zd == zm
17001 __ Mov(z12, z1);
17002 (masm.*macro_idx)(z12.VnD(), z2.VnD(), z12.VnD(), z0.VnD(), 1); // zd == zn
17003 __ Mov(z13, z2);
17004 (masm.*macro_idx)(z13.VnD(), z13.VnD(), z1.VnD(), z0.VnD(), 0); // zd == za
17005 __ Mov(z14, z0);
17006 // zd == zn == zm
17007 (masm.*macro_idx)(z14.VnD(), z2.VnD(), z14.VnD(), z14.VnD(), 1);
17008
TatWai Chonga2c1bb72020-02-16 23:16:47 -080017009 // The indexed forms of Fmla and Fmls never swap their arguments, so pass the
17010 // strict NaN propagation mode to ensure that the macros below do not swap
17011 // arguments in any case.
17012 FPMacroNaNPropagationOption option = StrictNaNPropagation;
17013 // Compute the results using other instructions.
Martyn Capewellc7501512020-03-16 10:35:33 +000017014 __ Dup(z0.VnH(), z30.VnH(), 0);
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070017015 FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z0.VnH());
Martyn Capewellc7501512020-03-16 10:35:33 +000017016 (masm.*macro)(z15.VnH(), p0.Merging(), z2.VnH(), z1.VnH(), z0.VnH(), option);
17017 __ Dup(z0.VnH(), z30.VnH(), 1);
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070017018 FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z0.VnH());
Martyn Capewellc7501512020-03-16 10:35:33 +000017019 (masm.*macro)(z16.VnH(), p0.Merging(), z2.VnH(), z1.VnH(), z0.VnH(), option);
17020 __ Dup(z0.VnH(), z30.VnH(), 4);
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070017021 FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z0.VnH());
Martyn Capewellc7501512020-03-16 10:35:33 +000017022 (masm.*macro)(z17.VnH(), p0.Merging(), z2.VnH(), z1.VnH(), z0.VnH(), option);
17023 __ Dup(z0.VnH(), z30.VnH(), 7);
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070017024 FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z0.VnH());
Martyn Capewellc7501512020-03-16 10:35:33 +000017025 (masm.*macro)(z18.VnH(), p0.Merging(), z2.VnH(), z1.VnH(), z0.VnH(), option);
TatWai Chonga2c1bb72020-02-16 23:16:47 -080017026
Martyn Capewellc7501512020-03-16 10:35:33 +000017027 __ Dup(z0.VnS(), z30.VnS(), 0);
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070017028 FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z0.VnS());
Martyn Capewellc7501512020-03-16 10:35:33 +000017029 (masm.*macro)(z19.VnS(), p0.Merging(), z2.VnS(), z1.VnS(), z0.VnS(), option);
17030 __ Dup(z0.VnS(), z30.VnS(), 1);
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070017031 FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z0.VnS());
Martyn Capewellc7501512020-03-16 10:35:33 +000017032 (masm.*macro)(z20.VnS(), p0.Merging(), z2.VnS(), z1.VnS(), z0.VnS(), option);
17033 __ Dup(z0.VnS(), z30.VnS(), 2);
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070017034 FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z0.VnS());
Martyn Capewellc7501512020-03-16 10:35:33 +000017035 (masm.*macro)(z21.VnS(), p0.Merging(), z2.VnS(), z1.VnS(), z0.VnS(), option);
17036 __ Dup(z0.VnS(), z30.VnS(), 3);
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070017037 FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z0.VnS());
Martyn Capewellc7501512020-03-16 10:35:33 +000017038 (masm.*macro)(z22.VnS(), p0.Merging(), z2.VnS(), z1.VnS(), z0.VnS(), option);
TatWai Chonga2c1bb72020-02-16 23:16:47 -080017039
Martyn Capewellc7501512020-03-16 10:35:33 +000017040 __ Dup(z0.VnD(), z30.VnD(), 0);
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070017041 FPSegmentPatternHelper(&masm, z0.VnD(), p0.Merging(), z0.VnD());
Martyn Capewellc7501512020-03-16 10:35:33 +000017042 (masm.*macro)(z23.VnD(), p0.Merging(), z2.VnD(), z1.VnD(), z0.VnD(), option);
17043 __ Dup(z0.VnD(), z30.VnD(), 1);
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070017044 FPSegmentPatternHelper(&masm, z0.VnD(), p0.Merging(), z0.VnD());
Martyn Capewellc7501512020-03-16 10:35:33 +000017045 (masm.*macro)(z24.VnD(), p0.Merging(), z2.VnD(), z1.VnD(), z0.VnD(), option);
Jacob Bramley8caa8732020-07-01 20:22:38 +010017046 FPSegmentPatternHelper(&masm, z0.VnD(), p0.Merging(), z30.VnD());
17047 __ Dup(z29.VnD(), z30.VnD(), 1);
17048 FPSegmentPatternHelper(&masm, z29.VnD(), p0.Merging(), z29.VnD());
17049 (masm.*macro)(z25.VnD(), p0.Merging(), z2.VnD(), z0.VnD(), z29.VnD(), option);
TatWai Chonga2c1bb72020-02-16 23:16:47 -080017050
17051 END();
17052
17053 if (CAN_RUN()) {
17054 RUN();
17055
17056 ASSERT_EQUAL_SVE(z15.VnH(), z3.VnH());
17057 ASSERT_EQUAL_SVE(z16.VnH(), z4.VnH());
17058 ASSERT_EQUAL_SVE(z17.VnH(), z5.VnH());
17059 ASSERT_EQUAL_SVE(z18.VnH(), z6.VnH());
17060
17061 ASSERT_EQUAL_SVE(z19.VnS(), z7.VnS());
17062 ASSERT_EQUAL_SVE(z20.VnS(), z8.VnS());
17063 ASSERT_EQUAL_SVE(z21.VnS(), z9.VnS());
17064 ASSERT_EQUAL_SVE(z22.VnS(), z10.VnS());
17065
17066 ASSERT_EQUAL_SVE(z23.VnD(), z11.VnD());
17067 ASSERT_EQUAL_SVE(z24.VnD(), z12.VnD());
17068 ASSERT_EQUAL_SVE(z11.VnD(), z13.VnD());
17069 ASSERT_EQUAL_SVE(z25.VnD(), z14.VnD());
17070 }
17071}
17072
17073TEST_SVE(sve_fmla_fmls_index) {
17074 uint64_t zm_inputs_1[] = {0x3ff000003f803c00, 0xbff00000bf80bc00};
17075 uint64_t zn_inputs_1[] = {0x3ff012343ff03c76, 0xbff01234bff0bc76};
17076 uint64_t za_inputs_1[] = {0x3c004000bc00c000, 0x64006800e400e800};
17077
17078 // Using the vector form of Fmla and Fmls to verify the indexed form.
17079 FPMulAccIdxHelper(config,
17080 &MacroAssembler::Fmla, // vector form
17081 &MacroAssembler::Fmla, // indexed form
17082 za_inputs_1,
17083 zn_inputs_1,
17084 zm_inputs_1);
17085
17086 FPMulAccIdxHelper(config,
17087 &MacroAssembler::Fmls, // vector form
17088 &MacroAssembler::Fmls, // indexed form
17089 za_inputs_1,
17090 zn_inputs_1,
17091 zm_inputs_1);
17092
17093 uint64_t zm_inputs_2[] = {0x7ff5555511111111, // NaN
17094 0xfff0000000000000}; // Infinity
17095 uint64_t zn_inputs_2[] = {0x7f9511117fc00000, // NaN
17096 0x7f800000ff800000}; // Infinity
17097 uint64_t za_inputs_2[] = {0x7c11000000007e00, // NaN
17098 0x000000007c00fc00}; // Infinity
17099 FPMulAccIdxHelper(config,
17100 &MacroAssembler::Fmla, // vector form
17101 &MacroAssembler::Fmla, // indexed form
17102 za_inputs_2,
17103 zn_inputs_2,
17104 zm_inputs_2);
17105
17106 FPMulAccIdxHelper(config,
17107 &MacroAssembler::Fmls, // vector form
17108 &MacroAssembler::Fmls, // indexed form
17109 za_inputs_2,
17110 zn_inputs_2,
17111 zm_inputs_2);
17112}
17113
TatWai Chongf8d29f12020-02-16 22:53:18 -080017114// Execute a number of instructions which all use ProcessNaNs, and check that
17115// they all propagate NaNs correctly.
17116template <typename Ti, typename Td, size_t N>
17117static void ProcessNaNsHelper(Test* config,
17118 int lane_size_in_bits,
17119 const Ti (&zn_inputs)[N],
17120 const Ti (&zm_inputs)[N],
17121 const Td (&zd_expected)[N],
17122 FPMacroNaNPropagationOption nan_option) {
17123 ArithFn arith_unpredicated_macro[] = {&MacroAssembler::Fadd,
17124 &MacroAssembler::Fsub,
17125 &MacroAssembler::Fmul};
17126
17127 for (size_t i = 0; i < ArrayLength(arith_unpredicated_macro); i++) {
17128 FPBinArithHelper(config,
17129 arith_unpredicated_macro[i],
17130 lane_size_in_bits,
17131 zn_inputs,
17132 zm_inputs,
17133 zd_expected);
17134 }
17135
17136 FPArithPredicatedFn arith_predicated_macro[] = {&MacroAssembler::Fmax,
17137 &MacroAssembler::Fmin};
17138 int pg_inputs[N];
17139 // Use an all-true predicate: this helper focuses on comparisons involving
17140 // special values, not on predication.
17141 for (size_t i = 0; i < N; i++) {
17142 pg_inputs[i] = 1;
17143 }
17144
17145 // fdivr propagates the quotient (Zm) preferentially, so we don't actually
17146 // need any special handling for StrictNaNPropagation.
17147 FPBinArithHelper(config,
17148 NULL,
17149 &MacroAssembler::Fdiv,
17150 lane_size_in_bits,
17151 // With an all-true predicate, the value in zd is
17152 // irrelevant to the operations.
17153 zn_inputs,
17154 pg_inputs,
17155 zn_inputs,
17156 zm_inputs,
17157 zd_expected);
17158
17159 for (size_t i = 0; i < ArrayLength(arith_predicated_macro); i++) {
17160 FPBinArithHelper(config,
17161 arith_predicated_macro[i],
17162 NULL,
17163 lane_size_in_bits,
17164 // With an all-true predicate, the value in zd is
17165 // irrelevant to the operations.
17166 zn_inputs,
17167 pg_inputs,
17168 zn_inputs,
17169 zm_inputs,
17170 zd_expected,
17171 nan_option);
17172 }
17173}
17174
17175template <typename Ti, typename Td, size_t N>
17176static void ProcessNaNsHelper3(Test* config,
17177 int lane_size_in_bits,
17178 const Ti (&za_inputs)[N],
17179 const Ti (&zn_inputs)[N],
17180 const Ti (&zm_inputs)[N],
17181 const Td (&zd_expected_fmla)[N],
17182 const Td (&zd_expected_fmls)[N],
17183 const Td (&zd_expected_fnmla)[N],
17184 const Td (&zd_expected_fnmls)[N],
17185 FPMacroNaNPropagationOption nan_option) {
17186 int pg_inputs[N];
17187 // Use an all-true predicate: this helper focuses on comparisons involving
17188 // special values, not on predication.
17189 for (size_t i = 0; i < N; i++) {
17190 pg_inputs[i] = 1;
17191 }
17192
17193 FPMulAccHelper(config,
17194 &MacroAssembler::Fmla,
17195 lane_size_in_bits,
17196 pg_inputs,
17197 za_inputs,
17198 zn_inputs,
17199 zm_inputs,
17200 zd_expected_fmla,
17201 nan_option);
17202
17203 FPMulAccHelper(config,
17204 &MacroAssembler::Fmls,
17205 lane_size_in_bits,
17206 pg_inputs,
17207 za_inputs,
17208 zn_inputs,
17209 zm_inputs,
17210 zd_expected_fmls,
17211 nan_option);
17212
17213 FPMulAccHelper(config,
17214 &MacroAssembler::Fnmla,
17215 lane_size_in_bits,
17216 pg_inputs,
17217 za_inputs,
17218 zn_inputs,
17219 zm_inputs,
17220 zd_expected_fnmla,
17221 nan_option);
17222
17223 FPMulAccHelper(config,
17224 &MacroAssembler::Fnmls,
17225 lane_size_in_bits,
17226 pg_inputs,
17227 za_inputs,
17228 zn_inputs,
17229 zm_inputs,
17230 zd_expected_fnmls,
17231 nan_option);
17232}
17233
17234TEST_SVE(sve_process_nans_double) {
17235 // Use non-standard NaNs to check that the payload bits are preserved.
17236 double sa = RawbitsToDouble(0x7ff5555511111111);
17237 double sn = RawbitsToDouble(0x7ff5555522222222);
17238 double sm = RawbitsToDouble(0x7ff5555533333333);
17239 double qa = RawbitsToDouble(0x7ffaaaaa11111111);
17240 double qn = RawbitsToDouble(0x7ffaaaaa22222222);
17241 double qm = RawbitsToDouble(0x7ffaaaaa33333333);
17242 VIXL_ASSERT(IsSignallingNaN(sa));
17243 VIXL_ASSERT(IsSignallingNaN(sn));
17244 VIXL_ASSERT(IsSignallingNaN(sm));
17245 VIXL_ASSERT(IsQuietNaN(qa));
17246 VIXL_ASSERT(IsQuietNaN(qn));
17247 VIXL_ASSERT(IsQuietNaN(qm));
17248
17249 // The input NaNs after passing through ProcessNaN.
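  // (ProcessNaN quietens a signalling NaN by setting the most significant
  // fraction bit while preserving the payload, e.g. 0x7ff5555511111111 becomes
  // 0x7ffd555511111111.)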
17250 uint64_t sa_proc = 0x7ffd555511111111;
17251 uint64_t sn_proc = 0x7ffd555522222222;
17252 uint64_t sm_proc = 0x7ffd555533333333;
17253 uint64_t qa_proc = DoubleToRawbits(qa);
17254 uint64_t qn_proc = DoubleToRawbits(qn);
17255 uint64_t qm_proc = DoubleToRawbits(qm);
17256 uint64_t sa_proc_n = sa_proc ^ kDSignMask;
17257 uint64_t sn_proc_n = sn_proc ^ kDSignMask;
17258 uint64_t qa_proc_n = qa_proc ^ kDSignMask;
17259 uint64_t qn_proc_n = qn_proc ^ kDSignMask;
17260
17261 // Quiet NaNs are propagated.
17262 double zn_inputs_1[] = {qn, 0.0, 0.0, qm, qn, qm};
17263 double zm_inputs_1[] = {0.0, qn, qm, 0.0, qm, qn};
17264 uint64_t zd_expected_1[] =
17265 {qn_proc, qn_proc, qm_proc, qm_proc, qn_proc, qm_proc};
17266
17267 ProcessNaNsHelper(config,
17268 kDRegSize,
17269 zn_inputs_1,
17270 zm_inputs_1,
17271 zd_expected_1,
17272 StrictNaNPropagation);
17273
17274 // Signalling NaNs are propagated.
17275 double zn_inputs_2[] = {sn, 0.0, 0.0, sm, sn, sm};
17276 double zm_inputs_2[] = {0.0, sn, sm, 0.0, sm, sn};
17277 uint64_t zd_expected_2[] =
17278 {sn_proc, sn_proc, sm_proc, sm_proc, sn_proc, sm_proc};
17279 ProcessNaNsHelper(config,
17280 kDRegSize,
17281 zn_inputs_2,
17282 zm_inputs_2,
17283 zd_expected_2,
17284 StrictNaNPropagation);
17285
17286 // Signalling NaNs take precedence over quiet NaNs.
17287 double zn_inputs_3[] = {sn, qn, sn, sn, qn};
17288 double zm_inputs_3[] = {qm, sm, sm, qn, sn};
17289 uint64_t zd_expected_3[] = {sn_proc, sm_proc, sn_proc, sn_proc, sn_proc};
17290 ProcessNaNsHelper(config,
17291 kDRegSize,
17292 zn_inputs_3,
17293 zm_inputs_3,
17294 zd_expected_3,
17295 StrictNaNPropagation);
17296
17297 double za_inputs_4[] = {qa, qa, 0.0, 0.0, qa, qa};
17298 double zn_inputs_4[] = {qn, 0.0, 0.0, qn, qn, qn};
17299 double zm_inputs_4[] = {0.0, qm, qm, qm, qm, 0.0};
17300
17301 // If `a` is propagated, its sign is inverted by fnmla and fnmls.
17302 // If `n` is propagated, its sign is inverted by fmls and fnmla.
17303 // If `m` is propagated, its sign is never inverted.
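  // For example, in the fourth element below (za = 0.0, zn = qn, zm = qm),
  // fmla and fnmls propagate qn unchanged, while fmls and fnmla propagate it
  // with the sign flipped (qn_proc_n).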
17304 uint64_t zd_expected_fmla_4[] =
17305 {qa_proc, qa_proc, qm_proc, qn_proc, qa_proc, qa_proc};
17306 uint64_t zd_expected_fmls_4[] =
17307 {qa_proc, qa_proc, qm_proc, qn_proc_n, qa_proc, qa_proc};
17308 uint64_t zd_expected_fnmla_4[] =
17309 {qa_proc_n, qa_proc_n, qm_proc, qn_proc_n, qa_proc_n, qa_proc_n};
17310 uint64_t zd_expected_fnmls_4[] =
17311 {qa_proc_n, qa_proc_n, qm_proc, qn_proc, qa_proc_n, qa_proc_n};
17312
17313 ProcessNaNsHelper3(config,
17314 kDRegSize,
17315 za_inputs_4,
17316 zn_inputs_4,
17317 zm_inputs_4,
17318 zd_expected_fmla_4,
17319 zd_expected_fmls_4,
17320 zd_expected_fnmla_4,
17321 zd_expected_fnmls_4,
17322 StrictNaNPropagation);
17323
17324 // Signalling NaNs take precedence over quiet NaNs.
17325 double za_inputs_5[] = {qa, qa, sa, sa, sa};
17326 double zn_inputs_5[] = {qn, sn, sn, sn, qn};
17327 double zm_inputs_5[] = {sm, qm, sm, qa, sm};
17328 uint64_t zd_expected_fmla_5[] = {sm_proc, sn_proc, sa_proc, sa_proc, sa_proc};
17329 uint64_t zd_expected_fmls_5[] = {sm_proc,
17330 sn_proc_n,
17331 sa_proc,
17332 sa_proc,
17333 sa_proc};
17334 uint64_t zd_expected_fnmla_5[] = {sm_proc,
17335 sn_proc_n,
17336 sa_proc_n,
17337 sa_proc_n,
17338 sa_proc_n};
17339 uint64_t zd_expected_fnmls_5[] = {sm_proc,
17340 sn_proc,
17341 sa_proc_n,
17342 sa_proc_n,
17343 sa_proc_n};
17344
17345 ProcessNaNsHelper3(config,
17346 kDRegSize,
17347 za_inputs_5,
17348 zn_inputs_5,
17349 zm_inputs_5,
17350 zd_expected_fmla_5,
17351 zd_expected_fmls_5,
17352 zd_expected_fnmla_5,
17353 zd_expected_fnmls_5,
17354 StrictNaNPropagation);
17355
17356 const double inf = kFP64PositiveInfinity;
17357 const double inf_n = kFP64NegativeInfinity;
17358 uint64_t inf_proc = DoubleToRawbits(inf);
17359 uint64_t inf_proc_n = DoubleToRawbits(inf_n);
17360 uint64_t d_inf_proc = DoubleToRawbits(kFP64DefaultNaN);
17361
17362 double za_inputs_6[] = {qa, qa, 0.0f, -0.0f, qa, sa};
17363 double zn_inputs_6[] = {inf, -0.0f, -0.0f, inf, inf_n, inf};
17364 double zm_inputs_6[] = {0.0f, inf_n, inf, inf, inf, 0.0f};
17365
17366 // quiet_nan + (0.0 * inf) produces the default NaN, not quiet_nan. Ditto for
17367 // (inf * 0.0). On the other hand, quiet_nan + (inf * inf) propagates the
17368 // quiet_nan.
17369 uint64_t zd_expected_fmla_6[] =
17370 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc, sa_proc};
17371 uint64_t zd_expected_fmls_6[] =
17372 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc, sa_proc};
17373 uint64_t zd_expected_fnmla_6[] =
17374 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc_n, sa_proc_n};
17375 uint64_t zd_expected_fnmls_6[] =
17376 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc_n, sa_proc_n};
17377
17378 ProcessNaNsHelper3(config,
17379 kDRegSize,
17380 za_inputs_6,
17381 zn_inputs_6,
17382 zm_inputs_6,
17383 zd_expected_fmla_6,
17384 zd_expected_fmls_6,
17385 zd_expected_fnmla_6,
17386 zd_expected_fnmls_6,
17387 StrictNaNPropagation);
17388}
17389
17390TEST_SVE(sve_process_nans_float) {
17391 // Use non-standard NaNs to check that the payload bits are preserved.
17392 float sa = RawbitsToFloat(0x7f951111);
17393 float sn = RawbitsToFloat(0x7f952222);
17394 float sm = RawbitsToFloat(0x7f953333);
17395 float qa = RawbitsToFloat(0x7fea1111);
17396 float qn = RawbitsToFloat(0x7fea2222);
17397 float qm = RawbitsToFloat(0x7fea3333);
17398 VIXL_ASSERT(IsSignallingNaN(sa));
17399 VIXL_ASSERT(IsSignallingNaN(sn));
17400 VIXL_ASSERT(IsSignallingNaN(sm));
17401 VIXL_ASSERT(IsQuietNaN(qa));
17402 VIXL_ASSERT(IsQuietNaN(qn));
17403 VIXL_ASSERT(IsQuietNaN(qm));
17404
17405 // The input NaNs after passing through ProcessNaN.
17406 uint32_t sa_proc = 0x7fd51111;
17407 uint32_t sn_proc = 0x7fd52222;
17408 uint32_t sm_proc = 0x7fd53333;
17409 uint32_t qa_proc = FloatToRawbits(qa);
17410 uint32_t qn_proc = FloatToRawbits(qn);
17411 uint32_t qm_proc = FloatToRawbits(qm);
17412 uint32_t sa_proc_n = sa_proc ^ kSSignMask;
17413 uint32_t sn_proc_n = sn_proc ^ kSSignMask;
17414 uint32_t qa_proc_n = qa_proc ^ kSSignMask;
17415 uint32_t qn_proc_n = qn_proc ^ kSSignMask;
17416
17417 // Quiet NaNs are propagated.
17418 float zn_inputs_1[] = {qn, 0.0f, 0.0f, qm, qn, qm};
17419 float zm_inputs_1[] = {0.0f, qn, qm, 0.0f, qm, qn};
17420 uint64_t zd_expected_1[] =
17421 {qn_proc, qn_proc, qm_proc, qm_proc, qn_proc, qm_proc};
17422
17423 ProcessNaNsHelper(config,
17424 kSRegSize,
17425 zn_inputs_1,
17426 zm_inputs_1,
17427 zd_expected_1,
17428 StrictNaNPropagation);
17429
17430 // Signalling NaNs are propagated.
17431 float zn_inputs_2[] = {sn, 0.0f, 0.0f, sm, sn, sm};
17432 float zm_inputs_2[] = {0.0f, sn, sm, 0.0f, sm, sn};
17433 uint64_t zd_expected_2[] =
17434 {sn_proc, sn_proc, sm_proc, sm_proc, sn_proc, sm_proc};
17435 ProcessNaNsHelper(config,
17436 kSRegSize,
17437 zn_inputs_2,
17438 zm_inputs_2,
17439 zd_expected_2,
17440 StrictNaNPropagation);
17441
17442 // Signalling NaNs take precedence over quiet NaNs.
17443 float zn_inputs_3[] = {sn, qn, sn, sn, qn};
17444 float zm_inputs_3[] = {qm, sm, sm, qn, sn};
17445 uint64_t zd_expected_3[] = {sn_proc, sm_proc, sn_proc, sn_proc, sn_proc};
17446 ProcessNaNsHelper(config,
17447 kSRegSize,
17448 zn_inputs_3,
17449 zm_inputs_3,
17450 zd_expected_3,
17451 StrictNaNPropagation);
17452
17453 float za_inputs_4[] = {qa, qa, 0.0f, 0.0f, qa, qa};
17454 float zn_inputs_4[] = {qn, 0.0f, 0.0f, qn, qn, qn};
17455 float zm_inputs_4[] = {0.0f, qm, qm, qm, qm, 0.0f};
17456
17457 // If `a` is propagated, its sign is inverted by fnmla and fnmls.
17458 // If `n` is propagated, its sign is inverted by fmls and fnmla.
17459 // If `m` is propagated, its sign is never inverted.
17460 uint64_t zd_expected_fmla_4[] =
17461 {qa_proc, qa_proc, qm_proc, qn_proc, qa_proc, qa_proc};
17462 uint64_t zd_expected_fmls_4[] =
17463 {qa_proc, qa_proc, qm_proc, qn_proc_n, qa_proc, qa_proc};
17464 uint64_t zd_expected_fnmla_4[] =
17465 {qa_proc_n, qa_proc_n, qm_proc, qn_proc_n, qa_proc_n, qa_proc_n};
17466 uint64_t zd_expected_fnmls_4[] =
17467 {qa_proc_n, qa_proc_n, qm_proc, qn_proc, qa_proc_n, qa_proc_n};
17468
17469 ProcessNaNsHelper3(config,
17470 kSRegSize,
17471 za_inputs_4,
17472 zn_inputs_4,
17473 zm_inputs_4,
17474 zd_expected_fmla_4,
17475 zd_expected_fmls_4,
17476 zd_expected_fnmla_4,
17477 zd_expected_fnmls_4,
17478 StrictNaNPropagation);
17479
17480 // Signalling NaNs take precedence over quiet NaNs.
17481 float za_inputs_5[] = {qa, qa, sa, sa, sa};
17482 float zn_inputs_5[] = {qn, sn, sn, sn, qn};
17483 float zm_inputs_5[] = {sm, qm, sm, qa, sm};
17484 uint64_t zd_expected_fmla_5[] = {sm_proc, sn_proc, sa_proc, sa_proc, sa_proc};
17485 uint64_t zd_expected_fmls_5[] = {sm_proc,
17486 sn_proc_n,
17487 sa_proc,
17488 sa_proc,
17489 sa_proc};
17490 uint64_t zd_expected_fnmla_5[] = {sm_proc,
17491 sn_proc_n,
17492 sa_proc_n,
17493 sa_proc_n,
17494 sa_proc_n};
17495 uint64_t zd_expected_fnmls_5[] = {sm_proc,
17496 sn_proc,
17497 sa_proc_n,
17498 sa_proc_n,
17499 sa_proc_n};
17500
17501 ProcessNaNsHelper3(config,
17502 kSRegSize,
17503 za_inputs_5,
17504 zn_inputs_5,
17505 zm_inputs_5,
17506 zd_expected_fmla_5,
17507 zd_expected_fmls_5,
17508 zd_expected_fnmla_5,
17509 zd_expected_fnmls_5,
17510 StrictNaNPropagation);
17511
17512 const float inf = kFP32PositiveInfinity;
17513 const float inf_n = kFP32NegativeInfinity;
17514 uint32_t inf_proc = FloatToRawbits(inf);
17515 uint32_t inf_proc_n = FloatToRawbits(inf_n);
17516 uint32_t d_inf_proc = FloatToRawbits(kFP32DefaultNaN);
17517
17518 float za_inputs_6[] = {qa, qa, 0.0f, 0.0f, qa, sa};
17519 float zn_inputs_6[] = {inf, 0.0f, 0.0f, inf, inf_n, inf};
17520 float zm_inputs_6[] = {0.0f, inf_n, inf, inf, inf, 0.0f};
17521
17522 // quiet_nan + (0.0 * inf) produces the default NaN, not quiet_nan. Ditto for
17523 // (inf * 0.0). On the other hand, quiet_nan + (inf * inf) propagates the
17524 // quiet_nan.
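// (The default NaN produced by the invalid (0.0 * inf) product takes
// precedence over a quiet NaN addend, but not over a signalling NaN addend:
// the lane combining `sa` with (inf * 0.0) still propagates `sa`.)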
17525 uint64_t zd_expected_fmla_6[] =
17526 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc, sa_proc};
17527 uint64_t zd_expected_fmls_6[] =
17528 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc, sa_proc};
17529 uint64_t zd_expected_fnmla_6[] =
17530 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc_n, sa_proc_n};
17531 uint64_t zd_expected_fnmls_6[] =
17532 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc_n, sa_proc_n};
17533
17534 ProcessNaNsHelper3(config,
17535 kSRegSize,
17536 za_inputs_6,
17537 zn_inputs_6,
17538 zm_inputs_6,
17539 zd_expected_fmla_6,
17540 zd_expected_fmls_6,
17541 zd_expected_fnmla_6,
17542 zd_expected_fnmls_6,
17543 StrictNaNPropagation);
17544}
17545
17546TEST_SVE(sve_process_nans_half) {
17547 // Use non-standard NaNs to check that the payload bits are preserved.
17548 Float16 sa(RawbitsToFloat16(0x7c11));
17549 Float16 sn(RawbitsToFloat16(0x7c22));
17550 Float16 sm(RawbitsToFloat16(0x7c33));
17551 Float16 qa(RawbitsToFloat16(0x7e44));
17552 Float16 qn(RawbitsToFloat16(0x7e55));
17553 Float16 qm(RawbitsToFloat16(0x7e66));
17554 VIXL_ASSERT(IsSignallingNaN(sa));
17555 VIXL_ASSERT(IsSignallingNaN(sn));
17556 VIXL_ASSERT(IsSignallingNaN(sm));
17557 VIXL_ASSERT(IsQuietNaN(qa));
17558 VIXL_ASSERT(IsQuietNaN(qn));
17559 VIXL_ASSERT(IsQuietNaN(qm));
17560
17561 // The input NaNs after passing through ProcessNaN.
17562 uint16_t sa_proc = 0x7e11;
17563 uint16_t sn_proc = 0x7e22;
17564 uint16_t sm_proc = 0x7e33;
17565 uint16_t qa_proc = Float16ToRawbits(qa);
17566 uint16_t qn_proc = Float16ToRawbits(qn);
17567 uint16_t qm_proc = Float16ToRawbits(qm);
17568 uint16_t sa_proc_n = sa_proc ^ kHSignMask;
17569 uint16_t sn_proc_n = sn_proc ^ kHSignMask;
17570 uint16_t qa_proc_n = qa_proc ^ kHSignMask;
17571 uint16_t qn_proc_n = qn_proc ^ kHSignMask;
17572 Float16 zero(0.0);
17573
17574 // Quiet NaNs are propagated.
17575 Float16 zn_inputs_1[] = {qn, zero, zero, qm, qn, qm};
17576 Float16 zm_inputs_1[] = {zero, qn, qm, zero, qm, qn};
17577 uint64_t zd_expected_1[] =
17578 {qn_proc, qn_proc, qm_proc, qm_proc, qn_proc, qm_proc};
17579
17580 ProcessNaNsHelper(config,
17581 kHRegSize,
17582 zn_inputs_1,
17583 zm_inputs_1,
17584 zd_expected_1,
17585 StrictNaNPropagation);
17586
17587 // Signalling NaNs are propagated.
17588 Float16 zn_inputs_2[] = {sn, zero, zero, sm, sn, sm};
17589 Float16 zm_inputs_2[] = {zero, sn, sm, zero, sm, sn};
17590 uint64_t zd_expected_2[] =
17591 {sn_proc, sn_proc, sm_proc, sm_proc, sn_proc, sm_proc};
17592 ProcessNaNsHelper(config,
17593 kHRegSize,
17594 zn_inputs_2,
17595 zm_inputs_2,
17596 zd_expected_2,
17597 StrictNaNPropagation);
17598
17599 // Signalling NaNs take precedence over quiet NaNs.
17600 Float16 zn_inputs_3[] = {sn, qn, sn, sn, qn};
17601 Float16 zm_inputs_3[] = {qm, sm, sm, qn, sn};
17602 uint64_t zd_expected_3[] = {sn_proc, sm_proc, sn_proc, sn_proc, sn_proc};
17603 ProcessNaNsHelper(config,
17604 kHRegSize,
17605 zn_inputs_3,
17606 zm_inputs_3,
17607 zd_expected_3,
17608 StrictNaNPropagation);
17609
17610 Float16 za_inputs_4[] = {qa, qa, zero, zero, qa, qa};
17611 Float16 zn_inputs_4[] = {qn, zero, zero, qn, qn, qn};
17612 Float16 zm_inputs_4[] = {zero, qm, qm, qm, qm, zero};
17613
17614 // If `a` is propagated, its sign is inverted by fnmla and fnmls.
17615 // If `n` is propagated, its sign is inverted by fmls and fnmla.
17616 // If `m` is propagated, its sign is never inverted.
17617 uint64_t zd_expected_fmla_4[] =
17618 {qa_proc, qa_proc, qm_proc, qn_proc, qa_proc, qa_proc};
17619 uint64_t zd_expected_fmls_4[] =
17620 {qa_proc, qa_proc, qm_proc, qn_proc_n, qa_proc, qa_proc};
17621 uint64_t zd_expected_fnmla_4[] =
17622 {qa_proc_n, qa_proc_n, qm_proc, qn_proc_n, qa_proc_n, qa_proc_n};
17623 uint64_t zd_expected_fnmls_4[] =
17624 {qa_proc_n, qa_proc_n, qm_proc, qn_proc, qa_proc_n, qa_proc_n};
17625
17626 ProcessNaNsHelper3(config,
17627 kHRegSize,
17628 za_inputs_4,
17629 zn_inputs_4,
17630 zm_inputs_4,
17631 zd_expected_fmla_4,
17632 zd_expected_fmls_4,
17633 zd_expected_fnmla_4,
17634 zd_expected_fnmls_4,
17635 StrictNaNPropagation);
17636
17637 // Signalling NaNs take precedence over quiet NaNs.
17638 Float16 za_inputs_5[] = {qa, qa, sa, sa, sa};
17639 Float16 zn_inputs_5[] = {qn, sn, sn, sn, qn};
17640 Float16 zm_inputs_5[] = {sm, qm, sm, qa, sm};
17641 uint64_t zd_expected_fmla_5[] = {sm_proc, sn_proc, sa_proc, sa_proc, sa_proc};
17642 uint64_t zd_expected_fmls_5[] = {sm_proc,
17643 sn_proc_n,
17644 sa_proc,
17645 sa_proc,
17646 sa_proc};
17647 uint64_t zd_expected_fnmla_5[] = {sm_proc,
17648 sn_proc_n,
17649 sa_proc_n,
17650 sa_proc_n,
17651 sa_proc_n};
17652 uint64_t zd_expected_fnmls_5[] = {sm_proc,
17653 sn_proc,
17654 sa_proc_n,
17655 sa_proc_n,
17656 sa_proc_n};
17657
17658 ProcessNaNsHelper3(config,
17659 kHRegSize,
17660 za_inputs_5,
17661 zn_inputs_5,
17662 zm_inputs_5,
17663 zd_expected_fmla_5,
17664 zd_expected_fmls_5,
17665 zd_expected_fnmla_5,
17666 zd_expected_fnmls_5,
17667 StrictNaNPropagation);
17668
17669 const Float16 inf = kFP16PositiveInfinity;
17670 const Float16 inf_n = kFP16NegativeInfinity;
17671 uint64_t inf_proc = Float16ToRawbits(inf);
17672 uint64_t inf_proc_n = Float16ToRawbits(inf_n);
17673 uint64_t d_inf_proc = Float16ToRawbits(kFP16DefaultNaN);
17674
17675 Float16 za_inputs_6[] = {qa, qa, zero, zero, qa, sa};
17676 Float16 zn_inputs_6[] = {inf, zero, zero, inf, inf_n, inf};
17677 Float16 zm_inputs_6[] = {zero, inf_n, inf, inf, inf, zero};
17678
17679 // quiet_nan + (0.0 * inf) produces the default NaN, not quiet_nan. Ditto for
17680 // (inf * 0.0). On the other hand, quiet_nan + (inf * inf) propagates the
17681 // quiet_nan.
17682 uint64_t zd_expected_fmla_6[] =
17683 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc, sa_proc};
17684 uint64_t zd_expected_fmls_6[] =
17685 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc, sa_proc};
17686 uint64_t zd_expected_fnmla_6[] =
17687 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc_n, sa_proc_n};
17688 uint64_t zd_expected_fnmls_6[] =
17689 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc_n, sa_proc_n};
17690
17691 ProcessNaNsHelper3(config,
17692 kHRegSize,
17693 za_inputs_6,
17694 zn_inputs_6,
17695 zm_inputs_6,
17696 zd_expected_fmla_6,
17697 zd_expected_fmls_6,
17698 zd_expected_fnmla_6,
17699 zd_expected_fnmls_6,
17700 StrictNaNPropagation);
17701}
17702
TatWai Chong47c26842020-02-10 01:51:32 -080017703typedef void (MacroAssembler::*FCmpFn)(const PRegisterWithLaneSize& pd,
17704 const PRegisterZ& pg,
17705 const ZRegister& zn,
17706 const ZRegister& zm);
17707
TatWai Chonge3775132020-02-16 22:13:17 -080017708typedef void (MacroAssembler::*FCmpZeroFn)(const PRegisterWithLaneSize& pd,
17709 const PRegisterZ& pg,
Jacob Bramley5a5e71f2020-07-02 13:54:58 +010017710 const ZRegister& zn,
17711 double zero);
TatWai Chonge3775132020-02-16 22:13:17 -080017712
TatWai Chong47c26842020-02-10 01:51:32 -080017713typedef void (MacroAssembler::*CmpFn)(const PRegisterWithLaneSize& pd,
17714 const PRegisterZ& pg,
17715 const ZRegister& zn,
17716 const ZRegister& zm);
17717
17718static FCmpFn GetFpAbsCompareFn(Condition cond) {
17719 switch (cond) {
17720 case ge:
17721 return &MacroAssembler::Facge;
17722 case gt:
17723 return &MacroAssembler::Facgt;
17724 case le:
17725 return &MacroAssembler::Facle;
17726 case lt:
17727 return &MacroAssembler::Faclt;
17728 default:
17729 VIXL_UNIMPLEMENTED();
17730 return NULL;
17731 }
17732}
17733
17734static FCmpFn GetFpCompareFn(Condition cond) {
17735 switch (cond) {
17736 case ge:
17737 return &MacroAssembler::Fcmge;
17738 case gt:
17739 return &MacroAssembler::Fcmgt;
17740 case le:
17741 return &MacroAssembler::Fcmle;
17742 case lt:
17743 return &MacroAssembler::Fcmlt;
17744 case eq:
17745 return &MacroAssembler::Fcmeq;
17746 case ne:
17747 return &MacroAssembler::Fcmne;
17748 case uo:
17749 return &MacroAssembler::Fcmuo;
17750 default:
17751 VIXL_UNIMPLEMENTED();
17752 return NULL;
17753 }
17754}
17755
TatWai Chonge3775132020-02-16 22:13:17 -080017756static FCmpZeroFn GetFpCompareZeroFn(Condition cond) {
17757 switch (cond) {
17758 case ge:
17759 return &MacroAssembler::Fcmge;
17760 case gt:
17761 return &MacroAssembler::Fcmgt;
17762 case le:
17763 return &MacroAssembler::Fcmle;
17764 case lt:
17765 return &MacroAssembler::Fcmlt;
17766 case eq:
17767 return &MacroAssembler::Fcmeq;
17768 case ne:
17769 return &MacroAssembler::Fcmne;
17770 default:
17771 VIXL_UNIMPLEMENTED();
17772 return NULL;
17773 }
17774}
17775
TatWai Chong47c26842020-02-10 01:51:32 -080017776static CmpFn GetIntCompareFn(Condition cond) {
17777 switch (cond) {
17778 case ge:
17779 return &MacroAssembler::Cmpge;
17780 case gt:
17781 return &MacroAssembler::Cmpgt;
17782 case le:
17783 return &MacroAssembler::Cmple;
17784 case lt:
17785 return &MacroAssembler::Cmplt;
17786 case eq:
17787 return &MacroAssembler::Cmpeq;
17788 case ne:
17789 return &MacroAssembler::Cmpne;
17790 default:
17791 VIXL_UNIMPLEMENTED();
17792 return NULL;
17793 }
17794}
17795
17796template <size_t N>
17797static void TestFpCompareHelper(Test* config,
17798 int lane_size_in_bits,
17799 Condition cond,
17800 const double (&zn_inputs)[N],
17801 const double (&zm_inputs)[N],
17802 const int (&pd_expected)[N],
17803 bool is_absolute = false) {
17804 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
17805 START();
17806
17807 ZRegister zt_int_1 = z1.WithLaneSize(lane_size_in_bits);
17808 ZRegister zt_int_2 = z2.WithLaneSize(lane_size_in_bits);
17809 ZRegister zt_int_3 = z3.WithLaneSize(lane_size_in_bits);
17810 ZRegister zt_fp_1 = z11.WithLaneSize(lane_size_in_bits);
17811 ZRegister zt_fp_2 = z12.WithLaneSize(lane_size_in_bits);
17812 ZRegister zt_fp_3 = z13.WithLaneSize(lane_size_in_bits);
17813 ZRegister fp_one = z31.WithLaneSize(lane_size_in_bits);
17814
17815 PRegisterWithLaneSize pd_result_int_1 = p15.WithLaneSize(lane_size_in_bits);
17816 PRegisterWithLaneSize pd_result_fp_1 = p14.WithLaneSize(lane_size_in_bits);
17817 PRegisterWithLaneSize pd_result_int_2 = p13.WithLaneSize(lane_size_in_bits);
17818 PRegisterWithLaneSize pd_result_fp_2 = p12.WithLaneSize(lane_size_in_bits);
17819
17820 FCmpFn fcmp = is_absolute ? GetFpAbsCompareFn(cond) : GetFpCompareFn(cond);
17821 __ Ptrue(p1.VnB());
17822
17823 if (cond != uo) {
17824 int pg_inputs[] = {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1};
17825 Initialise(&masm, p0.WithLaneSize(lane_size_in_bits), pg_inputs);
17826
17827 __ Fdup(fp_one, 0.1f);
17828
17829 __ Index(zt_int_1, 3, 3);
17830 __ Scvtf(zt_fp_1, p0.Merging(), zt_int_1);
17831 __ Fadd(zt_fp_1, zt_fp_1, fp_one);
17832
17833 __ Index(zt_int_2, 3, -10);
17834 __ Scvtf(zt_fp_2, p0.Merging(), zt_int_2);
17835 __ Fadd(zt_fp_2, zt_fp_2, fp_one);
17836
17837 __ Index(zt_int_3, 3, 2);
17838 __ Scvtf(zt_fp_3, p0.Merging(), zt_int_3);
17839 __ Fadd(zt_fp_3, zt_fp_3, fp_one);
17840
17841
17842 // There is no integer absolute-compare instruction, so use `abs` with `cmp<cc>`
17843 // to synthesize the expected result for `fac<cc>`.
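// Only zt_int_2 can contain negative values (zt_int_1 and zt_int_3 use
// positive start values and strides), so it is the only register that needs
// `abs`.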
17844 if (is_absolute) {
17845 __ Abs(zt_int_2, p1.Merging(), zt_int_2);
17846 }
17847
17848 CmpFn cmp = GetIntCompareFn(cond);
17849 (masm.*cmp)(pd_result_int_1, p0.Zeroing(), zt_int_1, zt_int_2);
17850 (masm.*fcmp)(pd_result_fp_1, p0.Zeroing(), zt_fp_1, zt_fp_2);
17851
17852 (masm.*cmp)(pd_result_int_2, p0.Zeroing(), zt_int_1, zt_int_3);
17853 (masm.*fcmp)(pd_result_fp_2, p0.Zeroing(), zt_fp_1, zt_fp_3);
17854 }
17855
17856 uint64_t zn_inputs_rawbits[N];
17857 uint64_t zm_inputs_rawbits[N];
17858 FPToRawbitsWithSize(zn_inputs, zn_inputs_rawbits, lane_size_in_bits);
17859 FPToRawbitsWithSize(zm_inputs, zm_inputs_rawbits, lane_size_in_bits);
17860
17861 ZRegister zn_fp = z14.WithLaneSize(lane_size_in_bits);
17862 ZRegister zm_fp = z15.WithLaneSize(lane_size_in_bits);
17863 InsrHelper(&masm, zn_fp, zn_inputs_rawbits);
17864 InsrHelper(&masm, zm_fp, zm_inputs_rawbits);
17865
17866 PRegisterWithLaneSize pd_result_fp_3 = p11.WithLaneSize(lane_size_in_bits);
17867 (masm.*fcmp)(pd_result_fp_3, p1.Zeroing(), zn_fp, zm_fp);
17868
17869 END();
17870
17871 if (CAN_RUN()) {
17872 RUN();
17873
17874 if (cond != uo) {
17875 ASSERT_EQUAL_SVE(pd_result_int_1, pd_result_fp_1);
17876 ASSERT_EQUAL_SVE(pd_result_int_2, pd_result_fp_2);
17877 }
17878 ASSERT_EQUAL_SVE(pd_expected, pd_result_fp_3);
17879 }
17880}
17881
17882TEST_SVE(sve_fp_compare_vectors) {
17883 double inf_p = kFP64PositiveInfinity;
17884 double inf_n = kFP64NegativeInfinity;
17885 double nan = kFP64DefaultNaN;
17886
17887 // Normal floating-point comparisons are covered by the helper; these inputs focus on special values.
17888 double zn[] = {0.0, inf_n, 1.0, inf_p, inf_p, nan, 0.0, nan};
17889 double zm[] = {-0.0, inf_n, inf_n, -2.0, inf_n, nan, nan, inf_p};
17890
17891 int pd_fcm_gt[] = {0, 0, 1, 1, 1, 0, 0, 0};
17892 int pd_fcm_lt[] = {0, 0, 0, 0, 0, 0, 0, 0};
17893 int pd_fcm_ge[] = {1, 1, 1, 1, 1, 0, 0, 0};
17894 int pd_fcm_le[] = {1, 1, 0, 0, 0, 0, 0, 0};
17895 int pd_fcm_eq[] = {1, 1, 0, 0, 0, 0, 0, 0};
Jacob Bramley4606adc2020-07-02 14:23:08 +010017896 int pd_fcm_ne[] = {0, 0, 1, 1, 1, 1, 1, 1};
TatWai Chong47c26842020-02-10 01:51:32 -080017897 int pd_fcm_uo[] = {0, 0, 0, 0, 0, 1, 1, 1};
17898 int pd_fac_gt[] = {0, 0, 0, 1, 0, 0, 0, 0};
17899 int pd_fac_lt[] = {0, 0, 1, 0, 0, 0, 0, 0};
17900 int pd_fac_ge[] = {1, 1, 0, 1, 1, 0, 0, 0};
17901 int pd_fac_le[] = {1, 1, 1, 0, 1, 0, 0, 0};
17902
17903 int lane_sizes[] = {kHRegSize, kSRegSize, kDRegSize};
17904
17905 for (size_t i = 0; i < ArrayLength(lane_sizes); i++) {
17906 int lane_size = lane_sizes[i];
17907 // Test floating-point compare vectors.
17908 TestFpCompareHelper(config, lane_size, gt, zn, zm, pd_fcm_gt);
17909 TestFpCompareHelper(config, lane_size, lt, zn, zm, pd_fcm_lt);
17910 TestFpCompareHelper(config, lane_size, ge, zn, zm, pd_fcm_ge);
17911 TestFpCompareHelper(config, lane_size, le, zn, zm, pd_fcm_le);
17912 TestFpCompareHelper(config, lane_size, eq, zn, zm, pd_fcm_eq);
17913 TestFpCompareHelper(config, lane_size, ne, zn, zm, pd_fcm_ne);
17914 TestFpCompareHelper(config, lane_size, uo, zn, zm, pd_fcm_uo);
17915
17916 // Test floating-point absolute compare vectors.
17917 TestFpCompareHelper(config, lane_size, gt, zn, zm, pd_fac_gt, true);
17918 TestFpCompareHelper(config, lane_size, lt, zn, zm, pd_fac_lt, true);
17919 TestFpCompareHelper(config, lane_size, ge, zn, zm, pd_fac_ge, true);
17920 TestFpCompareHelper(config, lane_size, le, zn, zm, pd_fac_le, true);
17921 }
17922}
17923
TatWai Chonge3775132020-02-16 22:13:17 -080017924template <size_t N, typename T>
17925static void TestFpCompareZeroHelper(Test* config,
17926 int lane_size_in_bits,
17927 Condition cond,
17928 const T (&zn_inputs)[N],
17929 const int (&pd_expected)[N]) {
17930 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
17931 START();
17932
17933 ZRegister zn = z28.WithLaneSize(lane_size_in_bits);
17934 PRegisterWithLaneSize pd = p14.WithLaneSize(lane_size_in_bits);
17935
17936 uint64_t zn_rawbits[N];
17937 FPToRawbitsWithSize(zn_inputs, zn_rawbits, lane_size_in_bits);
17938 InsrHelper(&masm, zn, zn_rawbits);
17939
17940 __ Ptrue(p0.VnB());
Jacob Bramley5a5e71f2020-07-02 13:54:58 +010017941 (masm.*GetFpCompareZeroFn(cond))(pd, p0.Zeroing(), zn, 0.0);
TatWai Chonge3775132020-02-16 22:13:17 -080017942
17943 END();
17944
17945 if (CAN_RUN()) {
17946 RUN();
17947
17948 ASSERT_EQUAL_SVE(pd_expected, pd);
17949 }
17950}
17951
17952TEST_SVE(sve_fp_compare_vector_zero) {
17953 Float16 fp16_inf_p = kFP16PositiveInfinity;
17954 Float16 fp16_inf_n = kFP16NegativeInfinity;
17955 Float16 fp16_dn = kFP16DefaultNaN;
17956 Float16 fp16_sn = RawbitsToFloat16(0x7c22);
17957 Float16 fp16_qn = RawbitsToFloat16(0x7e55);
17958
17959 float fp32_inf_p = kFP32PositiveInfinity;
17960 float fp32_inf_n = kFP32NegativeInfinity;
17961 float fp32_dn = kFP32DefaultNaN;
17962 float fp32_sn = RawbitsToFloat(0x7f952222);
17963 float fp32_qn = RawbitsToFloat(0x7fea2222);
17964
17965 double fp64_inf_p = kFP64PositiveInfinity;
17966 double fp64_inf_n = kFP64NegativeInfinity;
17967 double fp64_dn = kFP64DefaultNaN;
17968 double fp64_sn = RawbitsToDouble(0x7ff5555511111111);
17969 double fp64_qn = RawbitsToDouble(0x7ffaaaaa11111111);
17970
17971 // Normal floating-point comparison is covered by the non-zero (vector) form; these inputs focus on special values.
17972 Float16 zn_inputs_h[] = {Float16(0.0),
17973 Float16(-0.0),
17974 fp16_inf_p,
17975 fp16_inf_n,
17976 fp16_dn,
17977 fp16_sn,
17978 fp16_qn};
17979 float zn_inputs_s[] =
17980 {0.0, -0.0, fp32_inf_p, fp32_inf_n, fp32_dn, fp32_sn, fp32_qn};
17981 double zn_inputs_d[] =
17982 {0.0, -0.0, fp64_inf_p, fp64_inf_n, fp64_dn, fp64_sn, fp64_qn};
17983
17984 int pd_expected_gt[] = {0, 0, 1, 0, 0, 0, 0};
17985 int pd_expected_lt[] = {0, 0, 0, 1, 0, 0, 0};
17986 int pd_expected_ge[] = {1, 1, 1, 0, 0, 0, 0};
17987 int pd_expected_le[] = {1, 1, 0, 1, 0, 0, 0};
17988 int pd_expected_eq[] = {1, 1, 0, 0, 0, 0, 0};
Jacob Bramley4606adc2020-07-02 14:23:08 +010017989 int pd_expected_ne[] = {0, 0, 1, 1, 1, 1, 1};
TatWai Chonge3775132020-02-16 22:13:17 -080017990
17991 TestFpCompareZeroHelper(config, kDRegSize, gt, zn_inputs_d, pd_expected_gt);
17992 TestFpCompareZeroHelper(config, kDRegSize, lt, zn_inputs_d, pd_expected_lt);
17993 TestFpCompareZeroHelper(config, kDRegSize, ge, zn_inputs_d, pd_expected_ge);
17994 TestFpCompareZeroHelper(config, kDRegSize, le, zn_inputs_d, pd_expected_le);
17995 TestFpCompareZeroHelper(config, kDRegSize, eq, zn_inputs_d, pd_expected_eq);
17996 TestFpCompareZeroHelper(config, kDRegSize, ne, zn_inputs_d, pd_expected_ne);
17997
17998 TestFpCompareZeroHelper(config, kSRegSize, gt, zn_inputs_s, pd_expected_gt);
17999 TestFpCompareZeroHelper(config, kSRegSize, lt, zn_inputs_s, pd_expected_lt);
18000 TestFpCompareZeroHelper(config, kSRegSize, ge, zn_inputs_s, pd_expected_ge);
18001 TestFpCompareZeroHelper(config, kSRegSize, le, zn_inputs_s, pd_expected_le);
18002 TestFpCompareZeroHelper(config, kSRegSize, eq, zn_inputs_s, pd_expected_eq);
18003 TestFpCompareZeroHelper(config, kSRegSize, ne, zn_inputs_s, pd_expected_ne);
18004
18005 TestFpCompareZeroHelper(config, kHRegSize, gt, zn_inputs_h, pd_expected_gt);
18006 TestFpCompareZeroHelper(config, kHRegSize, lt, zn_inputs_h, pd_expected_lt);
18007 TestFpCompareZeroHelper(config, kHRegSize, ge, zn_inputs_h, pd_expected_ge);
18008 TestFpCompareZeroHelper(config, kHRegSize, le, zn_inputs_h, pd_expected_le);
18009 TestFpCompareZeroHelper(config, kHRegSize, eq, zn_inputs_h, pd_expected_eq);
18010 TestFpCompareZeroHelper(config, kHRegSize, ne, zn_inputs_h, pd_expected_ne);
18011}
18012
TatWai Chong2cb1b612020-03-04 23:51:21 -080018013typedef void (MacroAssembler::*FPUnaryMFn)(const ZRegister& zd,
18014 const PRegisterM& pg,
18015 const ZRegister& zn);
18016
18017typedef void (MacroAssembler::*FPUnaryZFn)(const ZRegister& zd,
18018 const PRegisterZ& pg,
18019 const ZRegister& zn);
18020
18021template <size_t N, size_t M>
18022static void TestFPUnaryPredicatedHelper(Test* config,
18023 int src_size_in_bits,
18024 int dst_size_in_bits,
18025 uint64_t (&zn_inputs)[N],
18026 const uint64_t (&pg_inputs)[M],
18027 const uint64_t (&zd_expected)[N],
18028 FPUnaryMFn macro_m,
18029 FPUnaryZFn macro_z) {
18030 // Provide the full predicate input.
18031 VIXL_ASSERT(M == (kPRegMaxSize / kDRegSize));
18032 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
18033 START();
18034
18035 int ds = dst_size_in_bits;
18036 int ss = src_size_in_bits;
18037 int ls = std::max(ss, ds);
18038
18039 // When the destination type is larger than the source type, fill the high
18040 // parts with noise values, which should be ignored.
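// (The noise is a per-lane signalling NaN pattern placed above the source
// element, so any accidental use of the high bits would show up clearly.)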
18041 if (ds > ss) {
18042 VIXL_ASSERT(ss < 64);
18043 uint64_t zn_inputs_mod[N];
18044 uint64_t sn = GetSignallingNan(ss);
18045 for (unsigned i = 0; i < N; i++) {
18046 zn_inputs_mod[i] = zn_inputs[i] | ((sn + i) << ss);
18047 }
18048 InsrHelper(&masm, z29.WithLaneSize(ls), zn_inputs_mod);
18049 } else {
18050 InsrHelper(&masm, z29.WithLaneSize(ls), zn_inputs);
18051 }
18052
18053 // Make a copy so we can check that constructive operations preserve zn.
18054 __ Mov(z28, z29);
18055
18056 // Run the operation on all lanes.
18057 __ Ptrue(p0.WithLaneSize(ls));
18058 (masm.*macro_m)(z27.WithLaneSize(ds), p0.Merging(), z28.WithLaneSize(ss));
18059
18060 Initialise(&masm,
18061 p1.VnB(),
18062 pg_inputs[3],
18063 pg_inputs[2],
18064 pg_inputs[1],
18065 pg_inputs[0]);
18066
18067 // Clear the irrelevant lanes.
18068 __ Index(z31.WithLaneSize(ls), 0, 1);
18069 __ Cmplt(p1.WithLaneSize(ls), p1.Zeroing(), z31.WithLaneSize(ls), N);
18070
18071 // Check merging predication.
18072 __ Index(z11.WithLaneSize(ls), 42, 1);
18073 // Preserve the base value so we can derive the expected result.
18074 __ Mov(z21, z11);
18075 __ Mov(z9, z11);
18076 (masm.*macro_m)(z11.WithLaneSize(ds), p1.Merging(), z28.WithLaneSize(ss));
18077
18078 // Generate expected values using explicit merging operations.
18079 InsrHelper(&masm, z25.WithLaneSize(ls), zd_expected);
18080 __ Mov(z21.WithLaneSize(ls), p1.Merging(), z25.WithLaneSize(ls));
18081
18082 // Check zeroing predication.
18083 __ Index(z12.WithLaneSize(ds), 42, -1);
18084 (masm.*macro_z)(z12.WithLaneSize(ds), p1.Zeroing(), z28.WithLaneSize(ss));
18085
18086 // Generate expected values using explicit zeroing operations.
18087 InsrHelper(&masm, z30.WithLaneSize(ls), zd_expected);
18088 // Emulate zeroing predication.
18089 __ Dup(z22.WithLaneSize(ls), 0);
18090 __ Mov(z22.WithLaneSize(ls), p1.Merging(), z30.WithLaneSize(ls));
18091
18092 // Check an in-place update.
18093 __ Mov(z9.WithLaneSize(ls), p1.Merging(), z28.WithLaneSize(ls));
18094 (masm.*macro_m)(z9.WithLaneSize(ds), p1.Merging(), z9.WithLaneSize(ss));
18095
18096 END();
18097
18098 if (CAN_RUN()) {
18099 RUN();
18100
18101 // Check all lanes.
18102 ASSERT_EQUAL_SVE(zd_expected, z27.WithLaneSize(ls));
18103
18104 // Check that constructive operations preserve their inputs.
18105 ASSERT_EQUAL_SVE(z28, z29);
18106
18107 // Check merging predication.
18108 ASSERT_EQUAL_SVE(z21.WithLaneSize(ls), z11.WithLaneSize(ls));
18109
18110 // Check zeroing predication.
18111 ASSERT_EQUAL_SVE(z22.WithLaneSize(ls), z12.WithLaneSize(ls));
18112
18113 // Check in-place operation where zd == zn.
18114 ASSERT_EQUAL_SVE(z21.WithLaneSize(ls), z9.WithLaneSize(ls));
18115 }
18116}
18117
18118template <size_t N, typename T>
18119static void TestFPUnaryPredicatedHelper(Test* config,
18120 int src_size_in_bits,
18121 int dst_size_in_bits,
18122 T (&zn_inputs)[N],
18123 const T (&zd_expected)[N],
18124 FPUnaryMFn macro_m,
18125 FPUnaryZFn macro_z) {
18126 uint64_t pg_inputs[] = {0xa55aa55aa55aa55a,
18127 0xa55aa55aa55aa55a,
18128 0xa55aa55aa55aa55a,
18129 0xa55aa55aa55aa55a};
18130
18131 TestFPUnaryPredicatedHelper(config,
18132 src_size_in_bits,
18133 dst_size_in_bits,
18134 zn_inputs,
18135 pg_inputs,
18136 zd_expected,
18137 macro_m,
18138 macro_z);
18139
18140 // The complement of the above predicate, to get full input coverage.
18141 uint64_t pg_c_inputs[] = {0x5aa55aa55aa55aa5,
18142 0x5aa55aa55aa55aa5,
18143 0x5aa55aa55aa55aa5,
18144 0x5aa55aa55aa55aa5};
18145
18146 TestFPUnaryPredicatedHelper(config,
18147 src_size_in_bits,
18148 dst_size_in_bits,
18149 zn_inputs,
18150 pg_c_inputs,
18151 zd_expected,
18152 macro_m,
18153 macro_z);
18154}
18155
18156template <size_t N, typename T>
18157static void TestFcvtHelper(Test* config,
18158 int src_size_in_bits,
18159 int dst_size_in_bits,
18160 T (&zn_inputs)[N],
18161 const T (&zd_expected)[N]) {
18162 TestFPUnaryPredicatedHelper(config,
18163 src_size_in_bits,
18164 dst_size_in_bits,
18165 zn_inputs,
18166 zd_expected,
18167 &MacroAssembler::Fcvt, // Merging form.
18168 &MacroAssembler::Fcvt); // Zeroing form.
18169}
18170
18171TEST_SVE(sve_fcvt) {
18172 uint64_t h_vals[] = {0x7c00,
18173 0xfc00,
18174 0,
18175 0x8000,
18176 0x7bff, // Max half precision.
18177 0x0400, // Min positive normal.
18178 0x03ff, // Max subnormal.
18179 0x0001}; // Min positive subnormal.
18180
18181 uint64_t s_vals[] = {0x7f800000,
18182 0xff800000,
18183 0,
18184 0x80000000,
18185 0x477fe000,
18186 0x38800000,
18187 0x387fc000,
18188 0x33800000};
18189
18190 uint64_t d_vals[] = {0x7ff0000000000000,
18191 0xfff0000000000000,
18192 0,
18193 0x8000000000000000,
18194 0x40effc0000000000,
18195 0x3f10000000000000,
18196 0x3f0ff80000000000,
18197 0x3e70000000000000};
18198
18199 TestFcvtHelper(config, kHRegSize, kSRegSize, h_vals, s_vals);
18200 TestFcvtHelper(config, kSRegSize, kHRegSize, s_vals, h_vals);
18201 TestFcvtHelper(config, kSRegSize, kDRegSize, s_vals, d_vals);
18202 TestFcvtHelper(config, kDRegSize, kSRegSize, d_vals, s_vals);
18203 TestFcvtHelper(config, kHRegSize, kDRegSize, h_vals, d_vals);
18204 TestFcvtHelper(config, kDRegSize, kHRegSize, d_vals, h_vals);
18205}
18206
18207TEST_SVE(sve_fcvt_nan) {
18208 uint64_t h_inputs[] = {0x7e55, // Quiet NaN.
18209 0x7c22}; // Signalling NaN.
18210
18211 uint64_t h2s_expected[] = {0x7fcaa000, 0x7fc44000};
18212
18213 uint64_t h2d_expected[] = {0x7ff9540000000000, 0x7ff8880000000000};
18214
18215 uint64_t s_inputs[] = {0x7fc12345, // Quiet NaN.
18216 0x7f812345}; // Signalling NaN.
18217
18218 uint64_t s2h_expected[] = {0x7e09, 0x7e09};
18219
18220 uint64_t s2d_expected[] = {0x7ff82468a0000000, 0x7ff82468a0000000};
18221
18222 uint64_t d_inputs[] = {0x7ffaaaaa22222222, // Quiet NaN.
18223 0x7ff5555511111111}; // Signalling NaN.
18224
18225 uint64_t d2h_expected[] = {0x7eaa, 0x7f55};
18226
18227 uint64_t d2s_expected[] = {0x7fd55551, 0x7feaaaa8};
18228
18229 TestFcvtHelper(config, kHRegSize, kSRegSize, h_inputs, h2s_expected);
18230 TestFcvtHelper(config, kSRegSize, kHRegSize, s_inputs, s2h_expected);
18231 TestFcvtHelper(config, kHRegSize, kDRegSize, h_inputs, h2d_expected);
18232 TestFcvtHelper(config, kDRegSize, kHRegSize, d_inputs, d2h_expected);
18233 TestFcvtHelper(config, kSRegSize, kDRegSize, s_inputs, s2d_expected);
18234 TestFcvtHelper(config, kDRegSize, kSRegSize, d_inputs, d2s_expected);
18235}
18236
TatWai Chongf60f6dc2020-02-21 10:48:11 -080018237template <size_t N, typename T>
18238static void TestFrecpxHelper(Test* config,
18239 int lane_size_in_bits,
18240 T (&zn_inputs)[N],
18241 const T (&zd_expected)[N]) {
18242 TestFPUnaryPredicatedHelper(config,
18243 lane_size_in_bits,
18244 lane_size_in_bits,
18245 zn_inputs,
18246 zd_expected,
18247 &MacroAssembler::Frecpx, // Merging form.
18248 &MacroAssembler::Frecpx); // Zeroing form.
18249}
18250
18251TEST_SVE(sve_frecpx_h) {
18252 uint64_t zn_inputs[] = {Float16ToRawbits(kFP16PositiveInfinity),
18253 Float16ToRawbits(kFP16NegativeInfinity),
18254 Float16ToRawbits(Float16(0.0)),
18255 Float16ToRawbits(Float16(-0.0)),
18256 0x0001, // Smallest positive subnormal number.
18257 0x03ff, // Largest subnormal number.
18258 0x0400, // Smallest positive normal number.
18259 0x7bff, // Largest normal number.
18260 0x3bff, // Largest number less than one.
18261 0x3c01, // Smallest number larger than one.
18262 0x7c22, // Signalling NaN.
18263 0x7e55}; // Quiet NaN.
18264
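// Frecpx keeps the operand's sign, zeroes the fraction and (roughly) replaces
// the exponent field with its bitwise complement; NaN inputs are quietened
// and propagated. (A sketch of the semantics encoded by the expected values
// below.)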
18265 uint64_t zd_expected[] = {0,
18266 0x8000,
18267 0x7800,
18268 0xf800,
18269 // The exponent field of subnormal numbers is zero.
18270 0x7800,
18271 0x7800,
18272 0x7800,
18273 0x0400,
18274 0x4400,
18275 0x4000,
18276 0x7e22, // To quiet NaN.
18277 0x7e55};
18278
18279 TestFrecpxHelper(config, kHRegSize, zn_inputs, zd_expected);
18280}
18281
18282TEST_SVE(sve_frecpx_s) {
18283 uint64_t zn_inputs[] = {FloatToRawbits(kFP32PositiveInfinity),
18284 FloatToRawbits(kFP32NegativeInfinity),
18285 FloatToRawbits(65504), // Max half precision.
18286 FloatToRawbits(6.10352e-5), // Min positive normal.
18287 FloatToRawbits(6.09756e-5), // Max subnormal.
18288 FloatToRawbits(
18289 5.96046e-8), // Min positive subnormal.
18290 FloatToRawbits(5e-9), // Not representable -> zero.
18291 FloatToRawbits(-0.0),
18292 FloatToRawbits(0.0),
18293 0x7f952222, // Signalling NaN.
18294 0x7fea2222}; // Quiet NaN.
18295
18296 uint64_t zd_expected[] = {0, // 0.0
18297 0x80000000, // -0.0
18298 0x38800000, // 6.10352e-05
18299 0x47000000, // 32768
18300 0x47800000, // 65536
18301 0x4c800000, // 6.71089e+07
18302 0x4e000000, // 5.36871e+08
18303 0xff000000, // -1.70141e+38
18304 0x7f000000, // 1.70141e+38
18305 0x7fd52222,
18306 0x7fea2222};
18307
18308 TestFrecpxHelper(config, kSRegSize, zn_inputs, zd_expected);
18309}
18310
18311TEST_SVE(sve_frecpx_d) {
18312 uint64_t zn_inputs[] = {DoubleToRawbits(kFP64PositiveInfinity),
18313 DoubleToRawbits(kFP64NegativeInfinity),
18314 DoubleToRawbits(65504), // Max half precision.
18315 DoubleToRawbits(6.10352e-5), // Min positive normal.
18316 DoubleToRawbits(6.09756e-5), // Max subnormal.
18317 DoubleToRawbits(
18318 5.96046e-8), // Min positive subnormal.
18319 DoubleToRawbits(5e-9), // Not representable -> zero.
18320 DoubleToRawbits(-0.0),
18321 DoubleToRawbits(0.0),
18322 0x7ff5555511111111, // Signalling NaN.
18323 0x7ffaaaaa11111111}; // Quiet NaN.
18324
18325 uint64_t zd_expected[] = {0, // 0.0
18326 0x8000000000000000, // -0.0
18327 0x3f10000000000000, // 6.10352e-05
18328 0x40e0000000000000, // 32768
18329 0x40f0000000000000, // 65536
18330 0x4190000000000000, // 6.71089e+07
18331 0x41c0000000000000, // 5.36871e+08
18332 0xffe0000000000000, // -1.70141e+38
18333 0x7fe0000000000000, // 1.70141e+38
18334 0x7ffd555511111111,
18335 0x7ffaaaaa11111111};
18336
18337 TestFrecpxHelper(config, kDRegSize, zn_inputs, zd_expected);
18338}
TatWai Chong2cb1b612020-03-04 23:51:21 -080018339
TatWai Chongb4a25f62020-02-27 00:53:57 -080018340template <size_t N, typename T>
18341static void TestFsqrtHelper(Test* config,
18342 int lane_size_in_bits,
18343 T (&zn_inputs)[N],
18344 const T (&zd_expected)[N]) {
18345 TestFPUnaryPredicatedHelper(config,
18346 lane_size_in_bits,
18347 lane_size_in_bits,
18348 zn_inputs,
18349 zd_expected,
18350 &MacroAssembler::Fsqrt, // Merging form.
18351 &MacroAssembler::Fsqrt); // Zeroing form.
18352}
18353
18354TEST_SVE(sve_fsqrt_h) {
18355 uint64_t zn_inputs[] =
18356 {Float16ToRawbits(Float16(0.0)),
18357 Float16ToRawbits(Float16(-0.0)),
18358 Float16ToRawbits(Float16(1.0)),
18359 Float16ToRawbits(Float16(65025.0)),
18360 Float16ToRawbits(kFP16PositiveInfinity),
18361 Float16ToRawbits(kFP16NegativeInfinity),
18362 Float16ToRawbits(Float16(6.10352e-5)), // Min normal positive.
18363 Float16ToRawbits(Float16(65504.0)), // Max normal positive float.
18364 Float16ToRawbits(Float16(6.09756e-5)), // Max subnormal.
18365 Float16ToRawbits(Float16(5.96046e-8)), // Min subnormal positive.
18366 0x7c22, // Signaling NaN
18367 0x7e55}; // Quiet NaN
18368
18369 uint64_t zd_expected[] = {Float16ToRawbits(Float16(0.0)),
18370 Float16ToRawbits(Float16(-0.0)),
18371 Float16ToRawbits(Float16(1.0)),
18372 Float16ToRawbits(Float16(255.0)),
18373 Float16ToRawbits(kFP16PositiveInfinity),
18374 Float16ToRawbits(kFP16DefaultNaN),
18375 0x2000,
18376 0x5bff,
18377 0x1fff,
18378 0x0c00,
18379 0x7e22, // To quiet NaN.
18380 0x7e55};
18381
18382 TestFsqrtHelper(config, kHRegSize, zn_inputs, zd_expected);
18383}
18384
18385TEST_SVE(sve_fsqrt_s) {
18386 uint64_t zn_inputs[] = {FloatToRawbits(0.0f),
18387 FloatToRawbits(-0.0f),
18388 FloatToRawbits(1.0f),
18389 FloatToRawbits(65536.0f),
18390 FloatToRawbits(kFP32PositiveInfinity),
18391 FloatToRawbits(kFP32NegativeInfinity),
18392 0x00800000, // Min normal positive, ~1.17e−38
18393 0x7f7fffff, // Max normal positive, ~3.40e+38
18394 0x00000001, // Min subnormal positive, ~1.40e−45
18395 0x007fffff, // Max subnormal, ~1.17e−38
18396 0x7f951111, // Signalling NaN.
18397 0x7fea1111}; // Quiet NaN.
18398
18399 uint64_t zd_expected[] = {FloatToRawbits(0.0f),
18400 FloatToRawbits(-0.0f),
18401 FloatToRawbits(1.0f),
18402 FloatToRawbits(256.0f),
18403 FloatToRawbits(kFP32PositiveInfinity),
18404 FloatToRawbits(kFP32DefaultNaN),
18405 0x20000000, // ~1.08e-19
18406 0x5f7fffff, // ~1.84e+19
18407 0x1a3504f3, // ~3.74e-23
18408 0x1fffffff, // ~1.08e-19
18409 0x7fd51111, // To quiet NaN.
18410 0x7fea1111};
18411
18412 TestFsqrtHelper(config, kSRegSize, zn_inputs, zd_expected);
18413}
18414
18415TEST_SVE(sve_fsqrt_d) {
18416 uint64_t zn_inputs[] =
18417 {DoubleToRawbits(0.0),
18418 DoubleToRawbits(-0.0),
18419 DoubleToRawbits(1.0),
18420 DoubleToRawbits(65536.0),
18421 DoubleToRawbits(kFP64PositiveInfinity),
18422 DoubleToRawbits(kFP64NegativeInfinity),
18423 0x0010000000000000, // Min normal positive, ~2.22e-308
18424 0x7fefffffffffffff, // Max normal positive, ~1.79e+308
18425 0x0000000000000001, // Min subnormal positive, 5e-324
18426 0x000fffffffffffff, // Max subnormal, ~2.22e-308
18427 0x7ff5555511111111,
18428 0x7ffaaaaa11111111};
18429
18430 uint64_t zd_expected[] = {DoubleToRawbits(0.0),
18431 DoubleToRawbits(-0.0),
18432 DoubleToRawbits(1.0),
18433 DoubleToRawbits(256.0),
18434 DoubleToRawbits(kFP64PositiveInfinity),
18435 DoubleToRawbits(kFP64DefaultNaN),
18436 0x2000000000000000, // ~1.49e-154
18437 0x5fefffffffffffff, // ~1.34e+154
18438 0x1e60000000000000, // ~2.22e-162
18439 0x1fffffffffffffff, // ~1.49e-154
18440 0x7ffd555511111111, // To quiet NaN.
18441 0x7ffaaaaa11111111};
18442
18443 TestFsqrtHelper(config, kDRegSize, zn_inputs, zd_expected);
18444}
18445
Martyn Capewell48522f52020-03-16 15:31:19 +000018446TEST_SVE(sve_adr) {
18447 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
18448 START();
18449
18450 __ Index(z0.VnD(), 0x10000000f0000000, 0x1000);
18451 __ Index(z1.VnD(), 1, 3);
18452 __ Index(z2.VnS(), -1, -1);
18453 __ Adr(z3.VnD(), SVEMemOperand(z0.VnD(), z1.VnD()));
18454 __ Adr(z4.VnD(), SVEMemOperand(z0.VnD(), z1.VnD(), LSL, 1));
18455 __ Adr(z5.VnD(), SVEMemOperand(z0.VnD(), z1.VnD(), LSL, 2));
18456 __ Adr(z6.VnD(), SVEMemOperand(z0.VnD(), z1.VnD(), LSL, 3));
18457 __ Adr(z7.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), UXTW));
18458 __ Adr(z8.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), UXTW, 1));
18459 __ Adr(z9.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), UXTW, 2));
18460 __ Adr(z10.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), UXTW, 3));
18461 __ Adr(z11.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), SXTW));
18462 __ Adr(z12.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), SXTW, 1));
18463 __ Adr(z13.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), SXTW, 2));
18464 __ Adr(z14.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), SXTW, 3));
18465 __ Adr(z15.VnS(), SVEMemOperand(z0.VnS(), z2.VnS()));
18466 __ Adr(z16.VnS(), SVEMemOperand(z0.VnS(), z2.VnS(), LSL, 1));
18467 __ Adr(z17.VnS(), SVEMemOperand(z0.VnS(), z2.VnS(), LSL, 2));
18468 __ Adr(z18.VnS(), SVEMemOperand(z0.VnS(), z2.VnS(), LSL, 3));
18469
18470 END();
18471
18472 if (CAN_RUN()) {
18473 RUN();
18474 uint64_t expected_z3[] = {0x10000000f0001004, 0x10000000f0000001};
18475 uint64_t expected_z4[] = {0x10000000f0001008, 0x10000000f0000002};
18476 uint64_t expected_z5[] = {0x10000000f0001010, 0x10000000f0000004};
18477 uint64_t expected_z6[] = {0x10000000f0001020, 0x10000000f0000008};
18478 uint64_t expected_z7[] = {0x10000001f0000ffd, 0x10000001efffffff};
18479 uint64_t expected_z8[] = {0x10000002f0000ffa, 0x10000002effffffe};
18480 uint64_t expected_z9[] = {0x10000004f0000ff4, 0x10000004effffffc};
18481 uint64_t expected_z10[] = {0x10000008f0000fe8, 0x10000008effffff8};
18482 uint64_t expected_z11[] = {0x10000000f0000ffd, 0x10000000efffffff};
18483 uint64_t expected_z12[] = {0x10000000f0000ffa, 0x10000000effffffe};
18484 uint64_t expected_z13[] = {0x10000000f0000ff4, 0x10000000effffffc};
18485 uint64_t expected_z14[] = {0x10000000f0000fe8, 0x10000000effffff8};
18486 uint64_t expected_z15[] = {0x0ffffffcf0000ffd, 0x0ffffffeefffffff};
18487 uint64_t expected_z16[] = {0x0ffffff8f0000ffa, 0x0ffffffceffffffe};
18488 uint64_t expected_z17[] = {0x0ffffff0f0000ff4, 0x0ffffff8effffffc};
18489 uint64_t expected_z18[] = {0x0fffffe0f0000fe8, 0x0ffffff0effffff8};
18490
18491 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
18492 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
18493 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
18494 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
18495 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
18496 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
18497 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
18498 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
18499 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
18500 ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
18501 ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
18502 ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
18503 ASSERT_EQUAL_SVE(expected_z15, z15.VnD());
18504 ASSERT_EQUAL_SVE(expected_z16, z16.VnD());
18505 ASSERT_EQUAL_SVE(expected_z17, z17.VnD());
18506 ASSERT_EQUAL_SVE(expected_z18, z18.VnD());
18507 }
18508}
18509
TatWai Chong85e15102020-05-04 21:00:40 -070018510// Test loads and broadcast by comparing them with the result of a set of
18511// equivalent scalar loads.
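// Each Ld1r* instruction loads one element and broadcasts it to every active
// lane; the reference result is built by issuing one scalar load per lane.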
18512template <typename F>
18513static void LoadBcastHelper(Test* config,
18514 unsigned msize_in_bits,
18515 unsigned esize_in_bits,
18516 F sve_ld1,
18517 bool is_signed) {
18518 VIXL_ASSERT((esize_in_bits == kBRegSize) || (esize_in_bits == kHRegSize) ||
18519 (esize_in_bits == kSRegSize) || (esize_in_bits == kDRegSize));
18520 static const unsigned kMaxLaneCount = kZRegMaxSize / kBRegSize;
18521
18522 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
18523 START();
18524
18525 unsigned msize_in_bytes = msize_in_bits / kBitsPerByte;
18526 unsigned esize_in_bytes = esize_in_bits / kBitsPerByte;
18527 int vl = config->sve_vl_in_bytes();
18528
18529 uint64_t offsets[kMaxLaneCount];
18530 uint64_t buffer_size = vl * 64;
18531 uint64_t data = reinterpret_cast<uintptr_t>(malloc(buffer_size));
18532 BufferFillingHelper(data,
18533 buffer_size,
18534 msize_in_bytes,
18535 kMaxLaneCount,
18536 offsets);
18537
18538 for (unsigned i = 0; i < (kMaxLaneCount / 2); i++) {
18539 // Assign encodable offsets to the first part of the offset array so that
18540 // both encodable and unencodable offsets can be tested.
18541 // Note that the immediate offset field is six bits wide.
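// (For reference: Ld1rb accepts immediate offsets of 0-63 bytes, and the
// wider forms scale the six-bit immediate by the access size, e.g. up to 252
// bytes for Ld1rw and 504 bytes for Ld1rd.)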
18542 offsets[i] = (offsets[i] % (UINT64_C(1) << 6)) * msize_in_bytes;
18543 }
18544
18545 ZRegister zn = z0.WithLaneSize(esize_in_bits);
18546 ZRegister zn_ref = z4.WithLaneSize(esize_in_bits);
18547
18548 PRegisterZ pg = p0.Zeroing();
18549 Initialise(&masm,
18550 pg,
18551 0x9abcdef012345678,
18552 0xabcdef0123456789,
18553 0xf4f3f1f0fefdfcfa,
18554 0xf9f8f6f5f3f2f0ff);
18555
18556 __ Mov(x2, data);
18557 uint64_t enablable_offset = offsets[0];
18558 // Simple check that the operation is correct for a single offset.
18559 (masm.*sve_ld1)(zn, pg, SVEMemOperand(x2, enablable_offset));
18560
18561 // Generate a reference result using scalar loads.
18562 uint64_t address = data + enablable_offset;
18563 uint64_t duplicated_addresses[kMaxLaneCount];
18564 for (unsigned i = 0; i < kMaxLaneCount; i++) {
18565 duplicated_addresses[i] = address;
18566 }
18567
18568 ScalarLoadHelper(&masm,
18569 vl,
18570 duplicated_addresses,
18571 zn_ref,
18572 pg,
18573 esize_in_bits,
18574 msize_in_bits,
18575 is_signed);
18576
18577 ZRegister zn_agg = z10.WithLaneSize(esize_in_bits);
18578 ZRegister zn_agg_ref = z11.WithLaneSize(esize_in_bits);
18579 ZRegister zn_temp = z12.WithLaneSize(esize_in_bits);
18580
18581 __ Dup(zn_agg, 0);
18582 __ Dup(zn_agg_ref, 0);
18583
18584 // Check that the operation is correct for a range of different offsets.
18585 for (unsigned i = 0; i < (vl / esize_in_bytes); i++) {
18586 (masm.*sve_ld1)(zn_temp, pg, SVEMemOperand(x2, offsets[i]));
18587 __ Lastb(x1, pg, zn_temp);
18588 __ Insr(zn_agg, x1);
18589
18590 __ Mov(x3, data + offsets[i]);
18591 ScalarLoadHelper(&masm, x1, x3, msize_in_bits, is_signed);
18592 __ Insr(zn_agg_ref, x1);
18593 }
18594
18595 END();
18596
18597 if (CAN_RUN()) {
18598 RUN();
18599
18600 ASSERT_EQUAL_SVE(zn_ref, zn);
18601 ASSERT_EQUAL_SVE(zn_agg_ref, zn_agg);
18602 }
18603
18604 free(reinterpret_cast<void*>(data));
18605}
18606
18607TEST_SVE(sve_ld1rb) {
18608 LoadBcastHelper(config, kBRegSize, kBRegSize, &MacroAssembler::Ld1rb, false);
18609 LoadBcastHelper(config, kBRegSize, kHRegSize, &MacroAssembler::Ld1rb, false);
18610 LoadBcastHelper(config, kBRegSize, kSRegSize, &MacroAssembler::Ld1rb, false);
18611 LoadBcastHelper(config, kBRegSize, kDRegSize, &MacroAssembler::Ld1rb, false);
18612}
18613
18614TEST_SVE(sve_ld1rh) {
18615 LoadBcastHelper(config, kHRegSize, kHRegSize, &MacroAssembler::Ld1rh, false);
18616 LoadBcastHelper(config, kHRegSize, kSRegSize, &MacroAssembler::Ld1rh, false);
18617 LoadBcastHelper(config, kHRegSize, kDRegSize, &MacroAssembler::Ld1rh, false);
18618}
18619
18620TEST_SVE(sve_ld1rw) {
18621 LoadBcastHelper(config, kSRegSize, kSRegSize, &MacroAssembler::Ld1rw, false);
18622 LoadBcastHelper(config, kSRegSize, kDRegSize, &MacroAssembler::Ld1rw, false);
18623}
18624
18625TEST_SVE(sve_ld1rd) {
18626 LoadBcastHelper(config, kDRegSize, kDRegSize, &MacroAssembler::Ld1rd, false);
18627}
18628
18629TEST_SVE(sve_ld1rsb) {
18630 LoadBcastHelper(config, kBRegSize, kHRegSize, &MacroAssembler::Ld1rsb, true);
18631 LoadBcastHelper(config, kBRegSize, kSRegSize, &MacroAssembler::Ld1rsb, true);
18632 LoadBcastHelper(config, kBRegSize, kDRegSize, &MacroAssembler::Ld1rsb, true);
18633}
18634
18635TEST_SVE(sve_ld1rsh) {
18636 LoadBcastHelper(config, kHRegSize, kSRegSize, &MacroAssembler::Ld1rsh, true);
18637 LoadBcastHelper(config, kHRegSize, kDRegSize, &MacroAssembler::Ld1rsh, true);
18638}
18639
18640TEST_SVE(sve_ld1rsw) {
18641 LoadBcastHelper(config, kSRegSize, kDRegSize, &MacroAssembler::Ld1rsw, true);
18642}
18643
TatWai Chong3db2c492020-03-29 22:20:41 -070018644TEST_SVE(sve_prefetch_offset) {
18645 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
18646
18647 START();
18648
18649 __ Prfb(PLDL1KEEP, p5, SVEMemOperand(z30.VnS(), 0));
18650 __ Prfb(PLDL1STRM, p5, SVEMemOperand(x28, -11, SVE_MUL_VL));
Martyn Capewellecca4b12020-07-02 14:30:50 +010018651 __ Prfb(PLDL2KEEP, p6, SVEMemOperand(x30, x29));
TatWai Chong3db2c492020-03-29 22:20:41 -070018652 __ Prfb(PLDL2STRM, p6, SVEMemOperand(x7, z12.VnS(), UXTW));
18653 __ Prfh(PSTL2KEEP, p6, SVEMemOperand(z0.VnS(), 28));
18654 __ Prfh(PSTL2STRM, p4, SVEMemOperand(x17, -3, SVE_MUL_VL));
Martyn Capewell102e7a52020-07-02 11:24:11 +010018655 __ Prfh(PSTL3KEEP, p3, SVEMemOperand(x0, x0, LSL, 1));
18656 __ Prfh(PSTL3STRM, p4, SVEMemOperand(x20, z0.VnD(), LSL, 1));
TatWai Chong3db2c492020-03-29 22:20:41 -070018657 __ Prfw(PLDL1KEEP, p3, SVEMemOperand(z23.VnD(), 5));
18658 __ Prfw(PLDL1STRM, p1, SVEMemOperand(x4, 10, SVE_MUL_VL));
Martyn Capewell102e7a52020-07-02 11:24:11 +010018659 __ Prfw(PLDL2KEEP, p2, SVEMemOperand(x22, x22, LSL, 2));
18660 __ Prfw(PLDL2STRM, p1, SVEMemOperand(x2, z6.VnS(), SXTW, 2));
TatWai Chong3db2c492020-03-29 22:20:41 -070018661 __ Prfd(PLDL3KEEP, p5, SVEMemOperand(z11.VnD(), 9));
18662 __ Prfd(PLDL3STRM, p3, SVEMemOperand(x0, -24, SVE_MUL_VL));
Martyn Capewell102e7a52020-07-02 11:24:11 +010018663 __ Prfd(PSTL1KEEP, p7, SVEMemOperand(x5, x5, LSL, 3));
18664 __ Prfd(PSTL1STRM, p1, SVEMemOperand(x19, z18.VnS(), SXTW, 3));
TatWai Chong3db2c492020-03-29 22:20:41 -070018665
18666 END();
18667 if (CAN_RUN()) {
18668 RUN();
18669 }
18670}
18671
Martyn Capewell51643312020-08-24 15:58:57 +010018672TEST_SVE(sve2_match_nmatch) {
18673 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
18674
18675 START();
18676
18677 __ Ptrue(p0.VnB());
18678 __ Ptrue(p1.VnH());
18679 __ Ptrue(p2.VnS());
18680
18681 // Vector to search is bytes 0 - 7, repeating every eight bytes.
18682 __ Index(z0.VnB(), 0, 1);
18683 __ Dup(z0.VnD(), z0.VnD(), 0);
18684
18685 // Elements to find are (repeated) bytes 0 - 3 in the first segment, 4 - 7
18686 // in the second, 8 - 11 in the third, etc.
18687 __ Index(z1.VnB(), 0, 1);
18688 __ Lsr(z1.VnB(), z1.VnB(), 2);
18689
18690 __ Match(p3.VnB(), p0.Zeroing(), z0.VnB(), z1.VnB());
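// Match sets a predicate lane when the corresponding element of the first
// source vector is equal to any element in the same 128-bit segment of the
// second source; Nmatch is its inverse. (Summary of the SVE2 semantics
// exercised below.)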
18691 __ Match(p4.VnB(), p1.Zeroing(), z0.VnB(), z1.VnB());
18692 __ Nmatch(p0.VnB(), p0.Zeroing(), z0.VnB(), z1.VnB());
18693
18694 __ Uunpklo(z0.VnH(), z0.VnB());
18695 __ Uunpklo(z1.VnH(), z1.VnB());
18696
18697 __ Match(p5.VnH(), p1.Zeroing(), z0.VnH(), z1.VnH());
18698 __ Match(p6.VnH(), p2.Zeroing(), z0.VnH(), z1.VnH());
18699 __ Nmatch(p1.VnH(), p1.Zeroing(), z0.VnH(), z1.VnH());
18700
18701 END();
18702 if (CAN_RUN()) {
18703 RUN();
18704
18705 int p3_exp[] = {1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0,
18706 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1};
18707 ASSERT_EQUAL_SVE(p3_exp, p3.VnB());
18708 int p4_exp[] = {0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
18709 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1};
18710 ASSERT_EQUAL_SVE(p4_exp, p4.VnB());
18711 int p0_exp[] = {0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1,
18712 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0};
18713 ASSERT_EQUAL_SVE(p0_exp, p0.VnB());
18714
18715 int p5_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
18716 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1};
18717 ASSERT_EQUAL_SVE(p5_exp, p5.VnB());
18718 int p6_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
18719 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
18720 ASSERT_EQUAL_SVE(p6_exp, p6.VnB());
18721 int p1_exp[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
18722 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0};
18723 ASSERT_EQUAL_SVE(p1_exp, p1.VnB());
18724 }
18725}
18726
Martyn Capewelleb37ef32020-09-09 16:46:41 +010018727TEST_SVE(sve2_saba_uaba) {
18728 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
18729
18730 START();
18731
18732 __ Index(z0.VnB(), 0, 1);
18733 __ Dup(z1.VnB(), 0xff);
18734 __ Dup(z2.VnB(), 1);
Martyn Capewell67d2f822020-10-13 16:39:33 +010018735 __ Uaba(z2.VnB(), z2.VnB(), z0.VnB(), z1.VnB());
Martyn Capewelleb37ef32020-09-09 16:46:41 +010018736 __ Index(z0.VnB(), 0, -1);
18737
18738 __ Index(z3.VnH(), 0, 1);
18739 __ Index(z4.VnH(), 1, 1);
Martyn Capewell67d2f822020-10-13 16:39:33 +010018740 __ Uaba(z3.VnH(), z3.VnH(), z3.VnH(), z4.VnH());
Martyn Capewelleb37ef32020-09-09 16:46:41 +010018741
18742 __ Index(z5.VnS(), 3, 6);
18743 __ Index(z6.VnS(), 5, 6);
Martyn Capewell67d2f822020-10-13 16:39:33 +010018744 __ Uaba(z5.VnS(), z5.VnS(), z5.VnS(), z6.VnS());
Martyn Capewelleb37ef32020-09-09 16:46:41 +010018745
18746 __ Index(z7.VnD(), 424, 12);
18747 __ Index(z8.VnD(), 4242, 12);
Martyn Capewell67d2f822020-10-13 16:39:33 +010018748 __ Uaba(z7.VnD(), z7.VnD(), z7.VnD(), z8.VnD());
Martyn Capewelleb37ef32020-09-09 16:46:41 +010018749
18750 __ Index(z9.VnH(), -1, -1);
18751 __ Dup(z10.VnB(), 0);
Martyn Capewell67d2f822020-10-13 16:39:33 +010018752 __ Saba(z10.VnB(), z10.VnB(), z9.VnB(), z10.VnB());
Martyn Capewelleb37ef32020-09-09 16:46:41 +010018753 __ Index(z11.VnH(), 0x0101, 1);
18754
18755 __ Index(z12.VnH(), 0, 1);
18756 __ Index(z13.VnH(), 0, -1);
Martyn Capewell67d2f822020-10-13 16:39:33 +010018757 __ Saba(z13.VnH(), z13.VnH(), z12.VnH(), z13.VnH());
Martyn Capewelleb37ef32020-09-09 16:46:41 +010018758
18759 __ Index(z14.VnS(), 0, 2);
18760 __ Index(z15.VnS(), 0, -2);
Martyn Capewell67d2f822020-10-13 16:39:33 +010018761 __ Saba(z15.VnS(), z15.VnS(), z14.VnS(), z15.VnS());
Martyn Capewelleb37ef32020-09-09 16:46:41 +010018762
18763 __ Index(z16.VnD(), 0, 42);
18764 __ Index(z17.VnD(), 0, -42);
Martyn Capewell67d2f822020-10-13 16:39:33 +010018765 __ Saba(z17.VnD(), z17.VnD(), z16.VnD(), z17.VnD());
TatWai Chong236e7ae2020-09-13 14:55:04 -070018766
Martyn Capewelleb37ef32020-09-09 16:46:41 +010018767 END();
18768
18769 if (CAN_RUN()) {
18770 RUN();
18771
18772 ASSERT_EQUAL_SVE(z0, z2);
18773 ASSERT_EQUAL_SVE(z3, z4);
18774 ASSERT_EQUAL_SVE(z5, z6);
18775 ASSERT_EQUAL_SVE(z7, z8);
18776
18777 ASSERT_EQUAL_SVE(z10, z11);
18778 ASSERT_EQUAL_SVE(z12, z13);
18779 ASSERT_EQUAL_SVE(z14, z15);
18780 ASSERT_EQUAL_SVE(z16, z17);
18781 }
18782}
18783
TatWai Chong236e7ae2020-09-13 14:55:04 -070018784TEST_SVE(sve2_integer_multiply_long_vector) {
18785 // This test only checks Sqdmull[b|t] and Pmull[b|t]; the other instructions in
18786 // the group operate on elements in the same way.
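// (For reference: Sqdmullb/Sqdmullt widen the even-/odd-numbered source
// elements and produce the saturated doubled product 2 * zn * zm, while
// Pmullb/Pmullt perform the corresponding carry-less (polynomial) multiplies.
// The indexed forms further down select a single zm element per 128-bit
// segment.)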
18787 int32_t zn_inputs_s[] =
18788 {1, -2, 3, -4, 5, -6, 7, -8, INT32_MIN, INT32_MAX, INT32_MAX, INT32_MIN};
18789
18790 int32_t zm_inputs_s[] =
18791 {1, 2, 3, 4, 5, 6, 7, 8, INT32_MAX, INT32_MIN, INT32_MAX, INT32_MIN};
TatWai Chong1719b712020-09-25 18:16:40 -070018792 int64_t sqdmullb_vec_expected_d[] =
TatWai Chong236e7ae2020-09-13 14:55:04 -070018793 {-8, -32, -72, -128, static_cast<int64_t>(0x8000000100000000), INT64_MAX};
18794
TatWai Chong1719b712020-09-25 18:16:40 -070018795 uint64_t sqdmullt_vec_expected_d[] =
18796 {2, 18, 50, 98, 0x8000000100000000, 0x7ffffffe00000002};
TatWai Chong236e7ae2020-09-13 14:55:04 -070018797
TatWai Chong1719b712020-09-25 18:16:40 -070018798 uint64_t pmullb_vec_expected_d[] = {0x00000001fffffffc,
18799 0x00000003fffffff0,
18800 0x000000020000001c,
18801 0x00000007ffffffc0,
18802 0x3fffffff80000000,
18803 0x4000000000000000};
TatWai Chong236e7ae2020-09-13 14:55:04 -070018804
TatWai Chong1719b712020-09-25 18:16:40 -070018805 uint64_t pmullt_vec_expected_d[] = {0x05,
18806 0x11,
18807 0x15,
18808 0x3fffffff80000000,
18809 0x1555555555555555};
18810
18811 uint64_t sqdmullb_idx_expected_d[] = {0xfffffffffffffff8,
18812 0xfffffffffffffff0,
18813 0xffffffffffffffb8,
18814 0xffffffffffffffa0,
18815 0x8000000100000000,
18816 INT64_MAX};
18817
18818 uint64_t sqdmullt_idx_expected_d[] =
18819 {8, // 2 * zn[11] * zm[8] = 2 * 4 * 1
18820 24, // 2 * zn[9] * zm[8] = 2 * 4 * 3
18821 80, // 2 * zn[7] * zm[4] = 2 * 8 * 5
18822 112, // 2 * zn[5] * zm[4] = 2 * 8 * 7
18823 0x7fffffffffffffff, // 2 * zn[3] * zm[0]
18824 0x8000000100000000}; // 2 * zn[1] * zm[0]
TatWai Chong236e7ae2020-09-13 14:55:04 -070018825
18826 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
18827 START();
18828
18829 InsrHelper(&masm, z31.VnS(), zn_inputs_s);
18830 InsrHelper(&masm, z30.VnS(), zm_inputs_s);
18831
18832 __ Sqdmullb(z1.VnD(), z31.VnS(), z30.VnS());
18833 __ Sqdmullt(z2.VnD(), z31.VnS(), z30.VnS());
TatWai Chong1719b712020-09-25 18:16:40 -070018834
TatWai Chong236e7ae2020-09-13 14:55:04 -070018835 __ Pmullb(z3.VnD(), z31.VnS(), z30.VnS());
18836 __ Pmullt(z4.VnD(), z31.VnS(), z30.VnS());
18837
TatWai Chong1719b712020-09-25 18:16:40 -070018838 __ Mov(z7, z30);
18839 __ Mov(z8, z31);
18840 __ Sqdmullb(z5.VnD(), z8.VnS(), z7.VnS(), 2);
18841 __ Sqdmullt(z6.VnD(), z8.VnS(), z7.VnS(), 0);
18842
TatWai Chong236e7ae2020-09-13 14:55:04 -070018843 END();
18844
18845 if (CAN_RUN()) {
18846 RUN();
18847
TatWai Chong1719b712020-09-25 18:16:40 -070018848 ASSERT_EQUAL_SVE(sqdmullb_vec_expected_d, z1.VnD());
18849 ASSERT_EQUAL_SVE(sqdmullt_vec_expected_d, z2.VnD());
18850 ASSERT_EQUAL_SVE(pmullb_vec_expected_d, z3.VnD());
18851 ASSERT_EQUAL_SVE(pmullt_vec_expected_d, z4.VnD());
18852 ASSERT_EQUAL_SVE(sqdmullb_idx_expected_d, z5.VnD());
18853 ASSERT_EQUAL_SVE(sqdmullt_idx_expected_d, z6.VnD());
TatWai Chong236e7ae2020-09-13 14:55:04 -070018854 }
18855}
18856
Jacob Bramleyd77a8e42019-02-12 16:52:24 +000018857} // namespace aarch64
18858} // namespace vixl