// Copyright 2019, VIXL authors
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
// * Neither the name of ARM Limited nor the names of its contributors may be
//   used to endorse or promote products derived from this software without
//   specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.

#include <cfloat>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <functional>
#include <sys/mman.h>
#include <unistd.h>

#include "test-runner.h"
#include "test-utils.h"

#include "aarch64/cpu-aarch64.h"
#include "aarch64/disasm-aarch64.h"
#include "aarch64/macro-assembler-aarch64.h"
#include "aarch64/simulator-aarch64.h"
#include "aarch64/test-utils-aarch64.h"
#include "test-assembler-aarch64.h"

#define TEST_SVE(name) TEST_SVE_INNER("ASM", name)

namespace vixl {
namespace aarch64 {

// Conveniently initialise P registers with scalar bit patterns. The destination
// lane size is ignored. This is optimised for call-site clarity, not generated
// code quality.
//
// Usage:
//
//    Initialise(&masm, p0, 0x1234);  // Sets p0 = 0b'0001'0010'0011'0100
void Initialise(MacroAssembler* masm,
                const PRegister& pd,
                uint64_t value3,
                uint64_t value2,
                uint64_t value1,
                uint64_t value0) {
  // Generate a literal pool, as in the array form.
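  // `value0` is emitted first in the pool, so (assuming a little-endian load
  // of the predicate below) it populates the lowest-numbered bits of `pd`.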
  UseScratchRegisterScope temps(masm);
  Register temp = temps.AcquireX();
  Label data;
  Label done;

  masm->Adr(temp, &data);
  masm->Ldr(pd, SVEMemOperand(temp));
  masm->B(&done);
  {
    ExactAssemblyScope total(masm, kPRegMaxSizeInBytes);
    masm->bind(&data);
    masm->dc64(value0);
    masm->dc64(value1);
    masm->dc64(value2);
    masm->dc64(value3);
  }
  masm->Bind(&done);
}
void Initialise(MacroAssembler* masm,
                const PRegister& pd,
                uint64_t value2,
                uint64_t value1,
                uint64_t value0) {
  Initialise(masm, pd, 0, value2, value1, value0);
}
void Initialise(MacroAssembler* masm,
                const PRegister& pd,
                uint64_t value1,
                uint64_t value0) {
  Initialise(masm, pd, 0, 0, value1, value0);
}
void Initialise(MacroAssembler* masm, const PRegister& pd, uint64_t value0) {
  Initialise(masm, pd, 0, 0, 0, value0);
}

// Conveniently initialise P registers by lane. This is optimised for call-site
// clarity, not generated code quality.
//
// Usage:
//
//    int values[] = { 0x0, 0x1, 0x2 };
//    Initialise(&masm, p0.VnS(), values);  // Sets p0 = 0b'0000'0001'0010
//
// The rightmost (highest-indexed) array element maps to the lowest-numbered
// lane. Unspecified lanes are set to 0 (inactive).
//
// Each element of the `values` array is mapped onto a lane in `pd`. The
// architecture only respects the lowest bit of each lane and writes zero to
// the upper bits, but other (encodable) values can be specified if required
// by the test.
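//
// For example (illustrative, following the packing performed below): S-sized
// lanes occupy four predicate bits each, so
//
//    int values[] = { 0xc, 0x7 };
//    Initialise(&masm, p0.VnS(), values);  // Sets p0 = 0b'1100'0111
//
// even though only the lowest bit of each lane is architecturally significant.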
template <typename T, size_t N>
void Initialise(MacroAssembler* masm,
                const PRegisterWithLaneSize& pd,
                const T (&values)[N]) {
  // Turn the array into 64-bit chunks.
  uint64_t chunks[4] = {0, 0, 0, 0};
  VIXL_STATIC_ASSERT(sizeof(chunks) == kPRegMaxSizeInBytes);

  int p_bits_per_lane = pd.GetLaneSizeInBits() / kZRegBitsPerPRegBit;
  VIXL_ASSERT((64 % p_bits_per_lane) == 0);
  VIXL_ASSERT((N * p_bits_per_lane) <= kPRegMaxSize);

  uint64_t p_lane_mask = GetUintMask(p_bits_per_lane);

  VIXL_STATIC_ASSERT(N <= kPRegMaxSize);
  size_t bit = 0;
  for (int n = static_cast<int>(N - 1); n >= 0; n--) {
    VIXL_ASSERT(bit < (sizeof(chunks) * kBitsPerByte));
    uint64_t value = values[n] & p_lane_mask;
    chunks[bit / 64] |= value << (bit % 64);
    bit += p_bits_per_lane;
  }

  Initialise(masm, pd, chunks[3], chunks[2], chunks[1], chunks[0]);
}

// Ensure that basic test infrastructure works.
TEST_SVE(sve_test_infrastructure_z) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  __ Mov(x0, 0x0123456789abcdef);

  // Test basic `Insr` behaviour.
  __ Insr(z0.VnB(), 1);
  __ Insr(z0.VnB(), 2);
  __ Insr(z0.VnB(), x0);
  __ Insr(z0.VnB(), -42);
  __ Insr(z0.VnB(), 0);

  // Test array inputs.
  int z1_inputs[] = {3, 4, 5, -42, 0};
  InsrHelper(&masm, z1.VnH(), z1_inputs);

  // Test that sign-extension works as intended for various lane sizes.
  __ Dup(z2.VnD(), 0);            // Clear the register first.
  __ Insr(z2.VnB(), -42);         // 0xd6
  __ Insr(z2.VnB(), 0xfe);        // 0xfe
  __ Insr(z2.VnH(), -42);         // 0xffd6
  __ Insr(z2.VnH(), 0xfedc);      // 0xfedc
  __ Insr(z2.VnS(), -42);         // 0xffffffd6
  __ Insr(z2.VnS(), 0xfedcba98);  // 0xfedcba98
  // Use another register for VnD(), so we can support 128-bit Z registers.
  __ Insr(z3.VnD(), -42);                 // 0xffffffffffffffd6
  __ Insr(z3.VnD(), 0xfedcba9876543210);  // 0xfedcba9876543210

  END();

  if (CAN_RUN()) {
    RUN();

    // Test that array checks work properly on a register initialised
    // lane-by-lane.
    int z0_inputs_b[] = {0x01, 0x02, 0xef, 0xd6, 0x00};
    ASSERT_EQUAL_SVE(z0_inputs_b, z0.VnB());

    // Test that lane-by-lane checks work properly on a register initialised
    // by array.
    for (size_t i = 0; i < ArrayLength(z1_inputs); i++) {
      // The rightmost (highest-indexed) array element maps to the
      // lowest-numbered lane.
      int lane = static_cast<int>(ArrayLength(z1_inputs) - i - 1);
      ASSERT_EQUAL_SVE_LANE(z1_inputs[i], z1.VnH(), lane);
    }

    uint64_t z2_inputs_d[] = {0x0000d6feffd6fedc, 0xffffffd6fedcba98};
    ASSERT_EQUAL_SVE(z2_inputs_d, z2.VnD());
    uint64_t z3_inputs_d[] = {0xffffffffffffffd6, 0xfedcba9876543210};
    ASSERT_EQUAL_SVE(z3_inputs_d, z3.VnD());
  }
}

// Ensure that basic test infrastructure works.
TEST_SVE(sve_test_infrastructure_p) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  // Simple cases: move boolean (0 or 1) values.

  int p0_inputs[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0};
  Initialise(&masm, p0.VnB(), p0_inputs);

  int p1_inputs[] = {1, 0, 1, 1, 0, 1, 1, 1};
  Initialise(&masm, p1.VnH(), p1_inputs);

  int p2_inputs[] = {1, 1, 0, 1};
  Initialise(&masm, p2.VnS(), p2_inputs);

  int p3_inputs[] = {0, 1};
  Initialise(&masm, p3.VnD(), p3_inputs);

  // Advanced cases: move numeric value into architecturally-ignored bits.

  // B-sized lanes get one bit in a P register, so there are no ignored bits.

  // H-sized lanes get two bits in a P register.
  int p4_inputs[] = {0x3, 0x2, 0x1, 0x0, 0x1, 0x2, 0x3};
  Initialise(&masm, p4.VnH(), p4_inputs);

  // S-sized lanes get four bits in a P register.
  int p5_inputs[] = {0xc, 0x7, 0x9, 0x6, 0xf};
  Initialise(&masm, p5.VnS(), p5_inputs);

  // D-sized lanes get eight bits in a P register.
  int p6_inputs[] = {0x81, 0xcc, 0x55};
  Initialise(&masm, p6.VnD(), p6_inputs);

  // The largest possible P register has 32 bytes.
  int p7_inputs[] = {0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
                     0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
                     0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
                     0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f};
  Initialise(&masm, p7.VnD(), p7_inputs);

  END();

  if (CAN_RUN()) {
    RUN();

    // Test that lane-by-lane checks work properly. The rightmost
    // (highest-indexed) array element maps to the lowest-numbered lane.
    for (size_t i = 0; i < ArrayLength(p0_inputs); i++) {
      int lane = static_cast<int>(ArrayLength(p0_inputs) - i - 1);
      ASSERT_EQUAL_SVE_LANE(p0_inputs[i], p0.VnB(), lane);
    }
    for (size_t i = 0; i < ArrayLength(p1_inputs); i++) {
      int lane = static_cast<int>(ArrayLength(p1_inputs) - i - 1);
      ASSERT_EQUAL_SVE_LANE(p1_inputs[i], p1.VnH(), lane);
    }
    for (size_t i = 0; i < ArrayLength(p2_inputs); i++) {
      int lane = static_cast<int>(ArrayLength(p2_inputs) - i - 1);
      ASSERT_EQUAL_SVE_LANE(p2_inputs[i], p2.VnS(), lane);
    }
    for (size_t i = 0; i < ArrayLength(p3_inputs); i++) {
      int lane = static_cast<int>(ArrayLength(p3_inputs) - i - 1);
      ASSERT_EQUAL_SVE_LANE(p3_inputs[i], p3.VnD(), lane);
    }

    // Test that array checks work properly on predicates initialised with a
    // possibly-different lane size.
    // 0b...11'10'01'00'01'10'11
    int p4_expected[] = {0x39, 0x1b};
    ASSERT_EQUAL_SVE(p4_expected, p4.VnD());

    ASSERT_EQUAL_SVE(p5_inputs, p5.VnS());

    // 0b...10000001'11001100'01010101
    int p6_expected[] = {2, 0, 0, 1, 3, 0, 3, 0, 1, 1, 1, 1};
    ASSERT_EQUAL_SVE(p6_expected, p6.VnH());

    // 0b...10011100'10011101'10011110'10011111
    int p7_expected[] = {1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1,
                         1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1};
    ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
  }
}

// Test that writes to V registers clear the high bits of the corresponding Z
// register.
TEST_SVE(sve_v_write_clear) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kNEON,
                          CPUFeatures::kFP,
                          CPUFeatures::kSVE);
  START();

  // The Simulator has two mechanisms for writing V registers:
  //  - Write*Register, calling through to SimRegisterBase::Write.
  //  - LogicVRegister::ClearForWrite followed by one or more lane updates.
  // Try to cover both variants.

  // Prepare some known inputs.
  uint8_t data[kQRegSizeInBytes];
  for (size_t i = 0; i < kQRegSizeInBytes; i++) {
    data[i] = 42 + i;
  }
  __ Mov(x10, reinterpret_cast<uintptr_t>(data));
  __ Fmov(d30, 42.0);

  // Use Index to label the lane indices, so failures are easy to detect and
  // diagnose.
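  // (Illustrative note: `Index` fills each lane with a value derived from its
  // position, e.g. z0.VnB() becomes {0, 1, 2, ...}, so a high lane that fails
  // to clear reveals where it came from.)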
  __ Index(z0.VnB(), 0, 1);
  __ Index(z1.VnB(), 0, 1);
  __ Index(z2.VnB(), 0, 1);
  __ Index(z3.VnB(), 0, 1);
  __ Index(z4.VnB(), 0, 1);

  __ Index(z10.VnB(), 0, -1);
  __ Index(z11.VnB(), 0, -1);
  __ Index(z12.VnB(), 0, -1);
  __ Index(z13.VnB(), 0, -1);
  __ Index(z14.VnB(), 0, -1);

  // Instructions using Write*Register (and SimRegisterBase::Write).
  __ Ldr(b0, MemOperand(x10));
  __ Fcvt(h1, d30);
  __ Fmov(s2, 1.5f);
  __ Fmov(d3, d30);
  __ Ldr(q4, MemOperand(x10));

  // Instructions using LogicVRegister::ClearForWrite.
  // These also (incidentally) test that across-lane instructions correctly
  // ignore the high-order Z register lanes.
  __ Sminv(b10, v10.V16B());
  __ Addv(h11, v11.V4H());
  __ Saddlv(s12, v12.V8H());
  __ Dup(v13.V8B(), b13, kDRegSizeInBytes);
  __ Uaddl(v14.V8H(), v14.V8B(), v14.V8B());

  END();

  if (CAN_RUN()) {
    RUN();

    // Check the Q part first.
    ASSERT_EQUAL_128(0x0000000000000000, 0x000000000000002a, v0);
    ASSERT_EQUAL_128(0x0000000000000000, 0x0000000000005140, v1);  // 42.0 (f16)
    ASSERT_EQUAL_128(0x0000000000000000, 0x000000003fc00000, v2);  // 1.5 (f32)
    ASSERT_EQUAL_128(0x0000000000000000, 0x4045000000000000, v3);  // 42.0 (f64)
    ASSERT_EQUAL_128(0x3938373635343332, 0x31302f2e2d2c2b2a, v4);
    ASSERT_EQUAL_128(0x0000000000000000, 0x00000000000000f1, v10);  // -15
    // 0xf9fa + 0xfbfc + 0xfdfe + 0xff00 -> 0xf2f4
    ASSERT_EQUAL_128(0x0000000000000000, 0x000000000000f2f4, v11);
    // 0xfffff1f2 + 0xfffff3f4 + ... + 0xfffffdfe + 0xffffff00 -> 0xffffc6c8
    ASSERT_EQUAL_128(0x0000000000000000, 0x00000000ffffc6c8, v12);
    ASSERT_EQUAL_128(0x0000000000000000, 0xf8f8f8f8f8f8f8f8, v13);  // [-8] x 8
    //    [0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, 0x0000]
    //  + [0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, 0x0000]
    // -> [0x01f2, 0x01f4, 0x01f6, 0x01f8, 0x01fa, 0x01fc, 0x01fe, 0x0000]
    ASSERT_EQUAL_128(0x01f201f401f601f8, 0x01fa01fc01fe0000, v14);

    // Check that the upper lanes are all clear.
    for (int i = kQRegSizeInBytes; i < core.GetSVELaneCount(kBRegSize); i++) {
      ASSERT_EQUAL_SVE_LANE(0x00, z0.VnB(), i);
      ASSERT_EQUAL_SVE_LANE(0x00, z1.VnB(), i);
      ASSERT_EQUAL_SVE_LANE(0x00, z2.VnB(), i);
      ASSERT_EQUAL_SVE_LANE(0x00, z3.VnB(), i);
      ASSERT_EQUAL_SVE_LANE(0x00, z4.VnB(), i);
      ASSERT_EQUAL_SVE_LANE(0x00, z10.VnB(), i);
      ASSERT_EQUAL_SVE_LANE(0x00, z11.VnB(), i);
      ASSERT_EQUAL_SVE_LANE(0x00, z12.VnB(), i);
      ASSERT_EQUAL_SVE_LANE(0x00, z13.VnB(), i);
      ASSERT_EQUAL_SVE_LANE(0x00, z14.VnB(), i);
    }
  }
}

static void MlaMlsHelper(Test* config, unsigned lane_size_in_bits) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  int zd_inputs[] = {0xbb, 0xcc, 0xdd, 0xee};
  int za_inputs[] = {-39, 1, -3, 2};
  int zn_inputs[] = {-5, -20, 9, 8};
  int zm_inputs[] = {9, -5, 4, 5};

  ZRegister zd = z0.WithLaneSize(lane_size_in_bits);
  ZRegister za = z1.WithLaneSize(lane_size_in_bits);
  ZRegister zn = z2.WithLaneSize(lane_size_in_bits);
  ZRegister zm = z3.WithLaneSize(lane_size_in_bits);

  // TODO: Use a simple `Dup` once it accepts arbitrary immediates.
  InsrHelper(&masm, zd, zd_inputs);
  InsrHelper(&masm, za, za_inputs);
  InsrHelper(&masm, zn, zn_inputs);
  InsrHelper(&masm, zm, zm_inputs);

  int p0_inputs[] = {1, 1, 0, 1};
  int p1_inputs[] = {1, 0, 1, 1};
  int p2_inputs[] = {0, 1, 1, 1};
  int p3_inputs[] = {1, 1, 1, 0};

  Initialise(&masm, p0.WithLaneSize(lane_size_in_bits), p0_inputs);
  Initialise(&masm, p1.WithLaneSize(lane_size_in_bits), p1_inputs);
  Initialise(&masm, p2.WithLaneSize(lane_size_in_bits), p2_inputs);
  Initialise(&masm, p3.WithLaneSize(lane_size_in_bits), p3_inputs);

  // The Mla macro automatically selects between mla, mad and movprfx + mla
  // based on what registers are aliased.
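  // For example (illustrative of the aliasing cases exercised below):
  //   Mla(zd, pg, zd, zn, zm)  // zd aliases za: expect mla.
  //   Mla(zd, pg, za, zd, zm)  // zd aliases zn: expect mad.
  //   Mla(zd, pg, za, zn, zm)  // no aliasing:   expect movprfx + mla.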
  ZRegister mla_da_result = z10.WithLaneSize(lane_size_in_bits);
  ZRegister mla_dn_result = z11.WithLaneSize(lane_size_in_bits);
  ZRegister mla_dm_result = z12.WithLaneSize(lane_size_in_bits);
  ZRegister mla_d_result = z13.WithLaneSize(lane_size_in_bits);

  __ Mov(mla_da_result, za);
  __ Mla(mla_da_result, p0.Merging(), mla_da_result, zn, zm);

  __ Mov(mla_dn_result, zn);
  __ Mla(mla_dn_result, p1.Merging(), za, mla_dn_result, zm);

  __ Mov(mla_dm_result, zm);
  __ Mla(mla_dm_result, p2.Merging(), za, zn, mla_dm_result);

  __ Mov(mla_d_result, zd);
  __ Mla(mla_d_result, p3.Merging(), za, zn, zm);

  // The Mls macro automatically selects between mls, msb and movprfx + mls
  // based on what registers are aliased.
  ZRegister mls_da_result = z20.WithLaneSize(lane_size_in_bits);
  ZRegister mls_dn_result = z21.WithLaneSize(lane_size_in_bits);
  ZRegister mls_dm_result = z22.WithLaneSize(lane_size_in_bits);
  ZRegister mls_d_result = z23.WithLaneSize(lane_size_in_bits);

  __ Mov(mls_da_result, za);
  __ Mls(mls_da_result, p0.Merging(), mls_da_result, zn, zm);

  __ Mov(mls_dn_result, zn);
  __ Mls(mls_dn_result, p1.Merging(), za, mls_dn_result, zm);

  __ Mov(mls_dm_result, zm);
  __ Mls(mls_dm_result, p2.Merging(), za, zn, mls_dm_result);

  __ Mov(mls_d_result, zd);
  __ Mls(mls_d_result, p3.Merging(), za, zn, zm);

  END();

  if (CAN_RUN()) {
    RUN();

    ASSERT_EQUAL_SVE(za_inputs, z1.WithLaneSize(lane_size_in_bits));
    ASSERT_EQUAL_SVE(zn_inputs, z2.WithLaneSize(lane_size_in_bits));
    ASSERT_EQUAL_SVE(zm_inputs, z3.WithLaneSize(lane_size_in_bits));

    int mla[] = {-84, 101, 33, 42};
    int mls[] = {6, -99, -39, -38};

    int mla_da_expected[] = {mla[0], mla[1], za_inputs[2], mla[3]};
    ASSERT_EQUAL_SVE(mla_da_expected, mla_da_result);

    int mla_dn_expected[] = {mla[0], zn_inputs[1], mla[2], mla[3]};
    ASSERT_EQUAL_SVE(mla_dn_expected, mla_dn_result);

    int mla_dm_expected[] = {zm_inputs[0], mla[1], mla[2], mla[3]};
    ASSERT_EQUAL_SVE(mla_dm_expected, mla_dm_result);

    int mla_d_expected[] = {mla[0], mla[1], mla[2], zd_inputs[3]};
    ASSERT_EQUAL_SVE(mla_d_expected, mla_d_result);

    int mls_da_expected[] = {mls[0], mls[1], za_inputs[2], mls[3]};
    ASSERT_EQUAL_SVE(mls_da_expected, mls_da_result);

    int mls_dn_expected[] = {mls[0], zn_inputs[1], mls[2], mls[3]};
    ASSERT_EQUAL_SVE(mls_dn_expected, mls_dn_result);

    int mls_dm_expected[] = {zm_inputs[0], mls[1], mls[2], mls[3]};
    ASSERT_EQUAL_SVE(mls_dm_expected, mls_dm_result);

    int mls_d_expected[] = {mls[0], mls[1], mls[2], zd_inputs[3]};
    ASSERT_EQUAL_SVE(mls_d_expected, mls_d_result);
  }
}

TEST_SVE(sve_mla_mls_b) { MlaMlsHelper(config, kBRegSize); }
TEST_SVE(sve_mla_mls_h) { MlaMlsHelper(config, kHRegSize); }
TEST_SVE(sve_mla_mls_s) { MlaMlsHelper(config, kSRegSize); }
TEST_SVE(sve_mla_mls_d) { MlaMlsHelper(config, kDRegSize); }

TEST_SVE(sve_bitwise_unpredicate_logical) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  uint64_t z8_inputs[] = {0xfedcba9876543210, 0x0123456789abcdef};
  InsrHelper(&masm, z8.VnD(), z8_inputs);
  uint64_t z15_inputs[] = {0xffffeeeeddddcccc, 0xccccddddeeeeffff};
  InsrHelper(&masm, z15.VnD(), z15_inputs);

  __ And(z1.VnD(), z8.VnD(), z15.VnD());
  __ Bic(z2.VnD(), z8.VnD(), z15.VnD());
  __ Eor(z3.VnD(), z8.VnD(), z15.VnD());
  __ Orr(z4.VnD(), z8.VnD(), z15.VnD());

  END();

  if (CAN_RUN()) {
    RUN();
    uint64_t z1_expected[] = {0xfedcaa8854540000, 0x0000454588aacdef};
    uint64_t z2_expected[] = {0x0000101022003210, 0x0123002201010000};
    uint64_t z3_expected[] = {0x01235476ab89fedc, 0xcdef98ba67453210};
    uint64_t z4_expected[] = {0xfffffefeffddfedc, 0xcdefddffefefffff};

    ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
    ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
    ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
    ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
  }
}

TEST_SVE(sve_last_r) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
  START();

  __ Pfalse(p1.VnB());
  int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
  int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
  Initialise(&masm, p2.VnB(), p2_inputs);
  Initialise(&masm, p3.VnB(), p3_inputs);
  __ Ptrue(p4.VnB());

  __ Index(z0.VnB(), 0x10, 1);
  __ Lasta(x1, p1, z0.VnB());
  __ Lastb(x2, p1, z0.VnB());
  __ Lasta(x3, p2, z0.VnB());
  __ Lastb(x4, p2, z0.VnB());
  __ Lasta(x5, p3, z0.VnB());
  __ Lastb(x6, p3, z0.VnB());
  __ Lasta(x7, p4, z0.VnB());

  __ Punpklo(p3.VnH(), p3.VnB());
  __ Index(z0.VnH(), 0x1110, 1);
  __ Lasta(x9, p1, z0.VnH());
  __ Lastb(x10, p3, z0.VnH());
  __ Lasta(x12, p4, z0.VnH());

  __ Index(z0.VnS(), 0x11111110, 1);
  __ Lastb(x13, p1, z0.VnS());
  __ Lasta(x14, p2, z0.VnS());
  __ Lastb(x18, p4, z0.VnS());

  __ Index(z0.VnD(), 0x1111111111111110, 1);
  __ Lasta(x19, p1, z0.VnD());
  __ Lastb(x20, p3, z0.VnD());
  __ Lasta(x21, p3, z0.VnD());
  END();

  if (CAN_RUN()) {
    RUN();

    ASSERT_EQUAL_64(0x0000000000000010, x1);
    ASSERT_EQUAL_64(0x0000000000000011, x3);
    ASSERT_EQUAL_64(0x0000000000000010, x4);
    ASSERT_EQUAL_64(0x0000000000000019, x5);
    ASSERT_EQUAL_64(0x0000000000000018, x6);
    ASSERT_EQUAL_64(0x0000000000000010, x7);
    ASSERT_EQUAL_64(0x0000000000001110, x9);
    ASSERT_EQUAL_64(0x0000000000001110, x12);
    ASSERT_EQUAL_64(0x0000000011111111, x14);
    ASSERT_EQUAL_64(0x1111111111111110, x19);

    int vl = core.GetSVELaneCount(kBRegSize) * 8;
    switch (vl) {
      case 128:
        ASSERT_EQUAL_64(0x000000000000001f, x2);
        ASSERT_EQUAL_64(0x0000000000001116, x10);
        ASSERT_EQUAL_64(0x0000000011111113, x13);
        ASSERT_EQUAL_64(0x0000000011111113, x18);
        ASSERT_EQUAL_64(0x1111111111111111, x20);
        ASSERT_EQUAL_64(0x1111111111111110, x21);
        break;
      case 512:
        ASSERT_EQUAL_64(0x000000000000004f, x2);
        ASSERT_EQUAL_64(0x0000000000001118, x10);
        ASSERT_EQUAL_64(0x000000001111111f, x13);
        ASSERT_EQUAL_64(0x000000001111111f, x18);
        ASSERT_EQUAL_64(0x1111111111111112, x20);
        ASSERT_EQUAL_64(0x1111111111111113, x21);
        break;
      case 2048:
        ASSERT_EQUAL_64(0x000000000000000f, x2);
        ASSERT_EQUAL_64(0x0000000000001118, x10);
        ASSERT_EQUAL_64(0x000000001111114f, x13);
        ASSERT_EQUAL_64(0x000000001111114f, x18);
        ASSERT_EQUAL_64(0x1111111111111112, x20);
        ASSERT_EQUAL_64(0x1111111111111113, x21);
        break;
      default:
        printf("WARNING: Some tests skipped due to unexpected VL.\n");
        break;
    }
  }
}

TEST_SVE(sve_last_v) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
  START();

  __ Pfalse(p1.VnB());
  int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
  int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
  Initialise(&masm, p2.VnB(), p2_inputs);
  Initialise(&masm, p3.VnB(), p3_inputs);
  __ Ptrue(p4.VnB());

  __ Index(z0.VnB(), 0x10, 1);
  __ Lasta(b1, p1, z0.VnB());
  __ Lastb(b2, p1, z0.VnB());
  __ Lasta(b3, p2, z0.VnB());
  __ Lastb(b4, p2, z0.VnB());
  __ Lasta(b5, p3, z0.VnB());
  __ Lastb(b6, p3, z0.VnB());
  __ Lasta(b7, p4, z0.VnB());

  __ Punpklo(p3.VnH(), p3.VnB());
  __ Index(z0.VnH(), 0x1110, 1);
  __ Lasta(h9, p1, z0.VnH());
  __ Lastb(h10, p3, z0.VnH());
  __ Lasta(h12, p4, z0.VnH());

  __ Index(z0.VnS(), 0x11111110, 1);
  __ Lastb(s13, p1, z0.VnS());
  __ Lasta(s14, p2, z0.VnS());
  __ Lastb(s18, p4, z0.VnS());

  __ Index(z0.VnD(), 0x1111111111111110, 1);
  __ Lasta(d19, p1, z0.VnD());
  __ Lastb(d20, p3, z0.VnD());
  __ Lasta(d21, p3, z0.VnD());
  END();

  if (CAN_RUN()) {
    RUN();

    ASSERT_EQUAL_128(0, 0x0000000000000010, q1);
    ASSERT_EQUAL_128(0, 0x0000000000000011, q3);
    ASSERT_EQUAL_128(0, 0x0000000000000010, q4);
    ASSERT_EQUAL_128(0, 0x0000000000000019, q5);
    ASSERT_EQUAL_128(0, 0x0000000000000018, q6);
    ASSERT_EQUAL_128(0, 0x0000000000000010, q7);
    ASSERT_EQUAL_128(0, 0x0000000000001110, q9);
    ASSERT_EQUAL_128(0, 0x0000000000001110, q12);
    ASSERT_EQUAL_128(0, 0x0000000011111111, q14);
    ASSERT_EQUAL_128(0, 0x1111111111111110, q19);

    int vl = core.GetSVELaneCount(kBRegSize) * 8;
    switch (vl) {
      case 128:
        ASSERT_EQUAL_128(0, 0x000000000000001f, q2);
        ASSERT_EQUAL_128(0, 0x0000000000001116, q10);
        ASSERT_EQUAL_128(0, 0x0000000011111113, q13);
        ASSERT_EQUAL_128(0, 0x0000000011111113, q18);
        ASSERT_EQUAL_128(0, 0x1111111111111111, q20);
        ASSERT_EQUAL_128(0, 0x1111111111111110, q21);
        break;
      case 512:
        ASSERT_EQUAL_128(0, 0x000000000000004f, q2);
        ASSERT_EQUAL_128(0, 0x0000000000001118, q10);
        ASSERT_EQUAL_128(0, 0x000000001111111f, q13);
        ASSERT_EQUAL_128(0, 0x000000001111111f, q18);
        ASSERT_EQUAL_128(0, 0x1111111111111112, q20);
        ASSERT_EQUAL_128(0, 0x1111111111111113, q21);
        break;
      case 2048:
        ASSERT_EQUAL_128(0, 0x000000000000000f, q2);
        ASSERT_EQUAL_128(0, 0x0000000000001118, q10);
        ASSERT_EQUAL_128(0, 0x000000001111114f, q13);
        ASSERT_EQUAL_128(0, 0x000000001111114f, q18);
        ASSERT_EQUAL_128(0, 0x1111111111111112, q20);
        ASSERT_EQUAL_128(0, 0x1111111111111113, q21);
        break;
      default:
        printf("WARNING: Some tests skipped due to unexpected VL.\n");
        break;
    }
  }
}

TEST_SVE(sve_clast_r) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
  START();

  __ Pfalse(p1.VnB());
  int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
  int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
  Initialise(&masm, p2.VnB(), p2_inputs);
  Initialise(&masm, p3.VnB(), p3_inputs);
  __ Ptrue(p4.VnB());

  __ Index(z0.VnB(), 0x10, 1);
  __ Mov(x1, -1);
  __ Mov(x2, -1);
  __ Clasta(x1, p1, x1, z0.VnB());
  __ Clastb(x2, p1, x2, z0.VnB());
  __ Clasta(x3, p2, x3, z0.VnB());
  __ Clastb(x4, p2, x4, z0.VnB());
  __ Clasta(x5, p3, x5, z0.VnB());
  __ Clastb(x6, p3, x6, z0.VnB());
  __ Clasta(x7, p4, x7, z0.VnB());

  __ Punpklo(p3.VnH(), p3.VnB());
  __ Index(z0.VnH(), 0x1110, 1);
  __ Mov(x9, -1);
  __ Clasta(x9, p1, x9, z0.VnH());
  __ Clastb(x10, p3, x10, z0.VnH());
  __ Clasta(x12, p4, x12, z0.VnH());

  __ Index(z0.VnS(), 0x11111110, 1);
  __ Mov(x13, -1);
  __ Clasta(x13, p1, x13, z0.VnS());
  __ Clastb(x14, p2, x14, z0.VnS());
  __ Clasta(x18, p4, x18, z0.VnS());

  __ Index(z0.VnD(), 0x1111111111111110, 1);
  __ Mov(x19, -1);
  __ Clasta(x19, p1, x19, z0.VnD());
  __ Clastb(x20, p2, x20, z0.VnD());
  __ Clasta(x21, p4, x21, z0.VnD());
  END();

  if (CAN_RUN()) {
    RUN();
    ASSERT_EQUAL_64(0x00000000000000ff, x1);
    ASSERT_EQUAL_64(0x00000000000000ff, x2);
    ASSERT_EQUAL_64(0x0000000000000011, x3);
    ASSERT_EQUAL_64(0x0000000000000010, x4);
    ASSERT_EQUAL_64(0x0000000000000019, x5);
    ASSERT_EQUAL_64(0x0000000000000018, x6);
    ASSERT_EQUAL_64(0x0000000000000010, x7);
    ASSERT_EQUAL_64(0x000000000000ffff, x9);
    ASSERT_EQUAL_64(0x0000000000001110, x12);
    ASSERT_EQUAL_64(0x00000000ffffffff, x13);
    ASSERT_EQUAL_64(0x0000000011111110, x14);
    ASSERT_EQUAL_64(0x0000000011111110, x18);
    ASSERT_EQUAL_64(0xffffffffffffffff, x19);
    ASSERT_EQUAL_64(0x1111111111111110, x20);
    ASSERT_EQUAL_64(0x1111111111111110, x21);

    int vl = core.GetSVELaneCount(kBRegSize) * 8;
    switch (vl) {
      case 128:
        ASSERT_EQUAL_64(0x0000000000001116, x10);
        break;
      case 512:
        ASSERT_EQUAL_64(0x0000000000001118, x10);
        break;
      case 2048:
        ASSERT_EQUAL_64(0x0000000000001118, x10);
        break;
      default:
        printf("WARNING: Some tests skipped due to unexpected VL.\n");
        break;
    }
  }
}

TEST_SVE(sve_clast_v) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
  START();

  __ Pfalse(p1.VnB());
  int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
  int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
  Initialise(&masm, p2.VnB(), p2_inputs);
  Initialise(&masm, p3.VnB(), p3_inputs);
  __ Ptrue(p4.VnB());

  __ Index(z0.VnB(), 0x10, 1);
  __ Dup(z1.VnB(), -1);
  __ Dup(z2.VnB(), -1);
  __ Clasta(b1, p1, b1, z0.VnB());
  __ Clastb(b2, p1, b2, z0.VnB());
  __ Clasta(b3, p2, b3, z0.VnB());
  __ Clastb(b4, p2, b4, z0.VnB());
  __ Clasta(b5, p3, b5, z0.VnB());
  __ Clastb(b6, p3, b6, z0.VnB());
  __ Clasta(b7, p4, b7, z0.VnB());

  __ Punpklo(p3.VnH(), p3.VnB());
  __ Index(z0.VnH(), 0x1110, 1);
  __ Dup(z9.VnB(), -1);
  __ Clasta(h9, p1, h9, z0.VnH());
  __ Clastb(h10, p3, h10, z0.VnH());
  __ Clasta(h12, p4, h12, z0.VnH());

  __ Index(z0.VnS(), 0x11111110, 1);
  __ Dup(z13.VnB(), -1);
  __ Clasta(s13, p1, s13, z0.VnS());
  __ Clastb(s14, p2, s14, z0.VnS());
  __ Clasta(s18, p4, s18, z0.VnS());

  __ Index(z0.VnD(), 0x1111111111111110, 1);
  __ Dup(z19.VnB(), -1);
  __ Clasta(d19, p1, d19, z0.VnD());
  __ Clastb(d20, p2, d20, z0.VnD());
  __ Clasta(d21, p4, d21, z0.VnD());
  END();

  if (CAN_RUN()) {
    RUN();
    ASSERT_EQUAL_128(0, 0x00000000000000ff, q1);
    ASSERT_EQUAL_128(0, 0x00000000000000ff, q2);
    ASSERT_EQUAL_128(0, 0x0000000000000011, q3);
    ASSERT_EQUAL_128(0, 0x0000000000000010, q4);
    ASSERT_EQUAL_128(0, 0x0000000000000019, q5);
    ASSERT_EQUAL_128(0, 0x0000000000000018, q6);
    ASSERT_EQUAL_128(0, 0x0000000000000010, q7);
    ASSERT_EQUAL_128(0, 0x000000000000ffff, q9);
    ASSERT_EQUAL_128(0, 0x0000000000001110, q12);
    ASSERT_EQUAL_128(0, 0x00000000ffffffff, q13);
    ASSERT_EQUAL_128(0, 0x0000000011111110, q14);
    ASSERT_EQUAL_128(0, 0x0000000011111110, q18);
    ASSERT_EQUAL_128(0, 0xffffffffffffffff, q19);
    ASSERT_EQUAL_128(0, 0x1111111111111110, q20);
    ASSERT_EQUAL_128(0, 0x1111111111111110, q21);

    int vl = core.GetSVELaneCount(kBRegSize) * 8;
    switch (vl) {
      case 128:
        ASSERT_EQUAL_128(0, 0x0000000000001116, q10);
        break;
      case 512:
        ASSERT_EQUAL_128(0, 0x0000000000001118, q10);
        break;
      case 2048:
        ASSERT_EQUAL_128(0, 0x0000000000001118, q10);
        break;
      default:
        printf("WARNING: Some tests skipped due to unexpected VL.\n");
        break;
    }
  }
}

TEST_SVE(sve_clast_z) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
  START();

  __ Pfalse(p1.VnB());
  int p2_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
  int p3_inputs[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
  Initialise(&masm, p2.VnB(), p2_inputs);
  Initialise(&masm, p3.VnB(), p3_inputs);
  __ Ptrue(p4.VnB());

  __ Index(z0.VnB(), 0x10, 1);
  __ Dup(z1.VnB(), 0xff);
  __ Dup(z2.VnB(), 0xff);
  __ Clasta(z1.VnB(), p1, z1.VnB(), z0.VnB());
  __ Clastb(z2.VnB(), p1, z2.VnB(), z0.VnB());
  __ Clasta(z3.VnB(), p2, z3.VnB(), z0.VnB());
  __ Clastb(z4.VnB(), p2, z4.VnB(), z0.VnB());
  __ Clasta(z5.VnB(), p3, z5.VnB(), z0.VnB());
  __ Clastb(z6.VnB(), p3, z6.VnB(), z0.VnB());
  __ Clasta(z7.VnB(), p4, z7.VnB(), z0.VnB());

  __ Punpklo(p3.VnH(), p3.VnB());
  __ Index(z0.VnH(), 0x1110, 1);
  __ Dup(z9.VnB(), 0xff);
  __ Clasta(z9.VnH(), p1, z9.VnH(), z0.VnH());
  __ Clastb(z10.VnH(), p3, z10.VnH(), z0.VnH());
  __ Clasta(z12.VnH(), p4, z12.VnH(), z0.VnH());

  __ Index(z0.VnS(), 0x11111110, 1);
  __ Dup(z13.VnB(), 0xff);
  __ Clasta(z13.VnS(), p1, z13.VnS(), z0.VnS());
  __ Clastb(z14.VnS(), p2, z14.VnS(), z0.VnS());
  __ Clasta(z16.VnS(), p4, z16.VnS(), z0.VnS());

  __ Index(z0.VnD(), 0x1111111111111110, 1);
  __ Dup(z17.VnB(), 0xff);
  __ Clasta(z17.VnD(), p1, z17.VnD(), z0.VnD());
  __ Clastb(z18.VnD(), p2, z18.VnD(), z0.VnD());
  __ Clasta(z20.VnD(), p4, z20.VnD(), z0.VnD());
  END();

  if (CAN_RUN()) {
    RUN();
    uint64_t z1_expected[] = {0xffffffffffffffff, 0xffffffffffffffff};
    uint64_t z2_expected[] = {0xffffffffffffffff, 0xffffffffffffffff};
    uint64_t z3_expected[] = {0x1111111111111111, 0x1111111111111111};
    uint64_t z4_expected[] = {0x1010101010101010, 0x1010101010101010};
    uint64_t z5_expected[] = {0x1919191919191919, 0x1919191919191919};
    uint64_t z6_expected[] = {0x1818181818181818, 0x1818181818181818};
    uint64_t z7_expected[] = {0x1010101010101010, 0x1010101010101010};
    uint64_t z9_expected[] = {0xffffffffffffffff, 0xffffffffffffffff};
    uint64_t z12_expected[] = {0x1110111011101110, 0x1110111011101110};
    uint64_t z13_expected[] = {0xffffffffffffffff, 0xffffffffffffffff};
    uint64_t z14_expected[] = {0x1111111011111110, 0x1111111011111110};
    uint64_t z16_expected[] = {0x1111111011111110, 0x1111111011111110};
    uint64_t z17_expected[] = {0xffffffffffffffff, 0xffffffffffffffff};
    uint64_t z18_expected[] = {0x1111111111111110, 0x1111111111111110};
    uint64_t z20_expected[] = {0x1111111111111110, 0x1111111111111110};

    uint64_t z10_expected_vl128[] = {0x1116111611161116, 0x1116111611161116};
    uint64_t z10_expected_vl_long[] = {0x1118111811181118, 0x1118111811181118};

    ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
    ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
    ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
    ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
    ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
    ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
    ASSERT_EQUAL_SVE(z7_expected, z7.VnD());
    ASSERT_EQUAL_SVE(z9_expected, z9.VnD());
    ASSERT_EQUAL_SVE(z12_expected, z12.VnD());
    ASSERT_EQUAL_SVE(z13_expected, z13.VnD());
    ASSERT_EQUAL_SVE(z14_expected, z14.VnD());
    ASSERT_EQUAL_SVE(z16_expected, z16.VnD());
    ASSERT_EQUAL_SVE(z17_expected, z17.VnD());
    ASSERT_EQUAL_SVE(z18_expected, z18.VnD());
    ASSERT_EQUAL_SVE(z20_expected, z20.VnD());

    int vl = core.GetSVELaneCount(kBRegSize) * 8;
    switch (vl) {
      case 128:
        ASSERT_EQUAL_SVE(z10_expected_vl128, z10.VnD());
        break;
      case 512:
      case 2048:
        ASSERT_EQUAL_SVE(z10_expected_vl_long, z10.VnD());
        break;
      default:
        printf("WARNING: Some tests skipped due to unexpected VL.\n");
        break;
    }
  }
}

TEST_SVE(sve_compact) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
  START();

  __ Ptrue(p0.VnB());
  __ Pfalse(p1.VnB());
  __ Zip1(p2.VnS(), p0.VnS(), p1.VnS());
  __ Zip1(p3.VnS(), p1.VnS(), p0.VnS());
  __ Zip1(p4.VnD(), p0.VnD(), p1.VnD());

  __ Index(z0.VnS(), 0x11111111, 0x11111111);
  __ Mov(q0, q0);
  __ Compact(z1.VnS(), p0, z0.VnS());
  __ Compact(z2.VnS(), p2, z0.VnS());
  __ Compact(z0.VnS(), p3, z0.VnS());

  __ Index(z3.VnD(), 0x1111111111111111, 0x1111111111111111);
  __ Mov(q3, q3);
  __ Compact(z4.VnD(), p0, z3.VnD());
  __ Compact(z5.VnD(), p1, z3.VnD());
  __ Compact(z6.VnD(), p4, z3.VnD());

  END();

  if (CAN_RUN()) {
    RUN();
    uint64_t z1_expected[] = {0x4444444433333333, 0x2222222211111111};
    uint64_t z2_expected[] = {0x0000000000000000, 0x3333333311111111};
    uint64_t z0_expected[] = {0x0000000000000000, 0x4444444422222222};
    uint64_t z4_expected[] = {0x2222222222222222, 0x1111111111111111};
    uint64_t z5_expected[] = {0x0000000000000000, 0x0000000000000000};
    uint64_t z6_expected[] = {0x0000000000000000, 0x1111111111111111};
    ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
    ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
    ASSERT_EQUAL_SVE(z0_expected, z0.VnD());
    ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
    ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
    ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
  }
}

TEST_SVE(sve_splice) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  __ Ptrue(p0.VnB());
  __ Pfalse(p1.VnB());
  int p2b_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
  int p3b_inputs[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0};
  int p4b_inputs[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
  int p5b_inputs[] = {0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0};
  int p6b_inputs[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0};
  Initialise(&masm, p2.VnB(), p2b_inputs);
  Initialise(&masm, p3.VnB(), p3b_inputs);
  Initialise(&masm, p4.VnB(), p4b_inputs);
  Initialise(&masm, p5.VnB(), p5b_inputs);
  Initialise(&masm, p6.VnB(), p6b_inputs);

  __ Index(z30.VnB(), 1, 1);

  __ Index(z0.VnB(), -1, -1);
  __ Splice(z0.VnB(), p0, z0.VnB(), z30.VnB());
  __ Index(z1.VnB(), -1, -1);
  __ Splice(z1.VnB(), p1, z1.VnB(), z30.VnB());
  __ Index(z2.VnB(), -1, -1);
  __ Splice(z2.VnB(), p2, z2.VnB(), z30.VnB());
  __ Index(z3.VnB(), -1, -1);
  __ Splice(z3.VnB(), p3, z3.VnB(), z30.VnB());
  __ Index(z4.VnB(), -1, -1);
  __ Splice(z4.VnB(), p4, z4.VnB(), z30.VnB());
  __ Index(z5.VnB(), -1, -1);
  __ Splice(z5.VnB(), p5, z5.VnB(), z30.VnB());
  __ Index(z6.VnB(), -1, -1);
  __ Splice(z6.VnB(), p6, z6.VnB(), z30.VnB());

  int p2h_inputs[] = {0, 0, 0, 0, 0, 0, 1, 0};
  int p3h_inputs[] = {0, 0, 1, 0, 0, 0, 1, 0};
  Initialise(&masm, p2.VnH(), p2h_inputs);
  Initialise(&masm, p3.VnH(), p3h_inputs);

  __ Index(z30.VnH(), 1, 1);
  __ Index(z29.VnH(), -1, -1);
  __ Splice(z7.VnH(), p2, z29.VnH(), z30.VnH());
  __ Splice(z8.VnH(), p3, z29.VnH(), z30.VnH());

  int p2s_inputs[] = {0, 0, 1, 0};
  int p3s_inputs[] = {1, 0, 1, 0};
  Initialise(&masm, p2.VnS(), p2s_inputs);
  Initialise(&masm, p3.VnS(), p3s_inputs);

  __ Index(z30.VnS(), 1, 1);
  __ Index(z29.VnS(), -1, -1);
  __ Splice(z9.VnS(), p2, z29.VnS(), z30.VnS());
  __ Splice(z10.VnS(), p3, z29.VnS(), z30.VnS());

  int p2d_inputs[] = {0, 1};
  int p3d_inputs[] = {1, 0};
  Initialise(&masm, p2.VnD(), p2d_inputs);
  Initialise(&masm, p3.VnD(), p3d_inputs);

  __ Index(z30.VnD(), 1, 1);
  __ Index(z29.VnD(), -1, -1);
  __ Splice(z11.VnD(), p2, z29.VnD(), z30.VnD());
  __ Splice(z30.VnD(), p3, z29.VnD(), z30.VnD());

  END();

  if (CAN_RUN()) {
    RUN();
    uint64_t z0_expected[] = {0xf0f1f2f3f4f5f6f7, 0xf8f9fafbfcfdfeff};
    uint64_t z1_expected[] = {0x100f0e0d0c0b0a09, 0x0807060504030201};
    uint64_t z2_expected[] = {0x0f0e0d0c0b0a0908, 0x07060504030201ff};
    uint64_t z3_expected[] = {0x0f0e0d0c0b0a0908, 0x07060504030201fe};
    uint64_t z4_expected[] = {0x0f0e0d0c0b0a0908, 0x07060504030201f0};
    uint64_t z5_expected[] = {0x0c0b0a0908070605, 0x04030201f6f7f8f9};
    uint64_t z6_expected[] = {0x01f0f1f2f3f4f5f6, 0xf7f8f9fafbfcfdfe};
    uint64_t z7_expected[] = {0x0007000600050004, 0x000300020001fffe};
    uint64_t z8_expected[] = {0x000300020001fffa, 0xfffbfffcfffdfffe};
    uint64_t z9_expected[] = {0x0000000300000002, 0x00000001fffffffe};
    uint64_t z10_expected[] = {0x00000001fffffffc, 0xfffffffdfffffffe};
    uint64_t z11_expected[] = {0x0000000000000001, 0xffffffffffffffff};
    uint64_t z30_expected[] = {0x0000000000000001, 0xfffffffffffffffe};

    ASSERT_EQUAL_SVE(z0_expected, z0.VnD());
    ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
    ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
    ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
    ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
    ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
    ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
    ASSERT_EQUAL_SVE(z7_expected, z7.VnD());
    ASSERT_EQUAL_SVE(z8_expected, z8.VnD());
    ASSERT_EQUAL_SVE(z9_expected, z9.VnD());
    ASSERT_EQUAL_SVE(z10_expected, z10.VnD());
    ASSERT_EQUAL_SVE(z11_expected, z11.VnD());
    ASSERT_EQUAL_SVE(z30_expected, z30.VnD());
  }
}

TEST_SVE(sve_predicate_logical) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  // 0b...01011010'10110111
  int p10_inputs[] = {0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1};  // Pm
  // 0b...11011001'01010010
  int p11_inputs[] = {1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0};  // Pn
  // 0b...01010101'10110010
  int p12_inputs[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0};  // pg

  Initialise(&masm, p10.VnB(), p10_inputs);
  Initialise(&masm, p11.VnB(), p11_inputs);
  Initialise(&masm, p12.VnB(), p12_inputs);

  __ Ands(p0.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
  __ Mrs(x0, NZCV);
  __ Bics(p1.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
  __ Mrs(x1, NZCV);
  __ Eor(p2.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
  __ Nand(p3.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
  __ Nor(p4.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
  __ Orn(p5.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
  __ Orr(p6.VnB(), p12.Zeroing(), p11.VnB(), p10.VnB());
  __ Sel(p7.VnB(), p12, p11.VnB(), p10.VnB());

  END();

  if (CAN_RUN()) {
    RUN();

    // 0b...01010000'00010010
    int p0_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0};
    // 0b...00000001'00000000
    int p1_expected[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0};
    // 0b...00000001'10100000
    int p2_expected[] = {0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0};
    // 0b...00000101'10100000
    int p3_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0};
    // 0b...00000100'00000000
    int p4_expected[] = {0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
    // 0b...01010101'00010010
    int p5_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0};
    // 0b...01010001'10110010
    int p6_expected[] = {0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0};
    // 0b...01011011'00010111
    int p7_expected[] = {0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1};

    ASSERT_EQUAL_SVE(p0_expected, p0.VnB());
    ASSERT_EQUAL_SVE(p1_expected, p1.VnB());
    ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
    ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
    ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
    ASSERT_EQUAL_SVE(p5_expected, p5.VnB());
    ASSERT_EQUAL_SVE(p6_expected, p6.VnB());
    ASSERT_EQUAL_SVE(p7_expected, p7.VnB());

    ASSERT_EQUAL_32(SVEFirstFlag, w0);
    ASSERT_EQUAL_32(SVENotLastFlag, w1);
  }
}

TEST_SVE(sve_int_compare_vectors) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  int z10_inputs[] = {0x00, 0x80, 0xff, 0x7f, 0x00, 0x00, 0x00, 0xff};
  int z11_inputs[] = {0x00, 0x00, 0x00, 0x00, 0x80, 0xff, 0x7f, 0xfe};
  int p0_inputs[] = {1, 0, 1, 1, 1, 1, 1, 1};
  InsrHelper(&masm, z10.VnB(), z10_inputs);
  InsrHelper(&masm, z11.VnB(), z11_inputs);
  Initialise(&masm, p0.VnB(), p0_inputs);

  __ Cmphs(p6.VnB(), p0.Zeroing(), z10.VnB(), z11.VnB());
  __ Mrs(x6, NZCV);

  uint64_t z12_inputs[] = {0xffffffffffffffff, 0x8000000000000000};
  uint64_t z13_inputs[] = {0x0000000000000000, 0x8000000000000000};
  int p1_inputs[] = {1, 1};
  InsrHelper(&masm, z12.VnD(), z12_inputs);
  InsrHelper(&masm, z13.VnD(), z13_inputs);
  Initialise(&masm, p1.VnD(), p1_inputs);

  __ Cmphi(p7.VnD(), p1.Zeroing(), z12.VnD(), z13.VnD());
  __ Mrs(x7, NZCV);

  int z14_inputs[] = {0, 32767, -1, -32767, 0, 0, 0, 32766};
  int z15_inputs[] = {0, 0, 0, 0, 32767, -1, -32767, 32767};

  int p2_inputs[] = {1, 0, 1, 1, 1, 1, 1, 1};
  InsrHelper(&masm, z14.VnH(), z14_inputs);
  InsrHelper(&masm, z15.VnH(), z15_inputs);
  Initialise(&masm, p2.VnH(), p2_inputs);

  __ Cmpge(p8.VnH(), p2.Zeroing(), z14.VnH(), z15.VnH());
  __ Mrs(x8, NZCV);

  __ Cmpeq(p9.VnH(), p2.Zeroing(), z14.VnH(), z15.VnH());
  __ Mrs(x9, NZCV);

  int z16_inputs[] = {0, -1, 0, 0};
  int z17_inputs[] = {0, 0, 2147483647, -2147483648};
  int p3_inputs[] = {1, 1, 1, 1};
  InsrHelper(&masm, z16.VnS(), z16_inputs);
  InsrHelper(&masm, z17.VnS(), z17_inputs);
  Initialise(&masm, p3.VnS(), p3_inputs);

  __ Cmpgt(p10.VnS(), p3.Zeroing(), z16.VnS(), z17.VnS());
  __ Mrs(x10, NZCV);

  __ Cmpne(p11.VnS(), p3.Zeroing(), z16.VnS(), z17.VnS());
  __ Mrs(x11, NZCV);

  // Architectural aliases testing.
  __ Cmpls(p12.VnB(), p0.Zeroing(), z11.VnB(), z10.VnB());  // HS
  __ Cmplo(p13.VnD(), p1.Zeroing(), z13.VnD(), z12.VnD());  // HI
  __ Cmple(p14.VnH(), p2.Zeroing(), z15.VnH(), z14.VnH());  // GE
  __ Cmplt(p15.VnS(), p3.Zeroing(), z17.VnS(), z16.VnS());  // GT

  END();

  if (CAN_RUN()) {
    RUN();

    int p6_expected[] = {1, 0, 1, 1, 0, 0, 0, 1};
    for (size_t i = 0; i < ArrayLength(p6_expected); i++) {
      int lane = static_cast<int>(ArrayLength(p6_expected) - i - 1);
      ASSERT_EQUAL_SVE_LANE(p6_expected[i], p6.VnB(), lane);
    }

    int p7_expected[] = {1, 0};
    ASSERT_EQUAL_SVE(p7_expected, p7.VnD());

    int p8_expected[] = {1, 0, 0, 0, 0, 1, 1, 0};
    ASSERT_EQUAL_SVE(p8_expected, p8.VnH());

    int p9_expected[] = {1, 0, 0, 0, 0, 0, 0, 0};
    ASSERT_EQUAL_SVE(p9_expected, p9.VnH());

    int p10_expected[] = {0, 0, 0, 1};
    ASSERT_EQUAL_SVE(p10_expected, p10.VnS());

    int p11_expected[] = {0, 1, 1, 1};
    ASSERT_EQUAL_SVE(p11_expected, p11.VnS());

    // Reuse the expected results to verify the architectural aliases.
    ASSERT_EQUAL_SVE(p6_expected, p12.VnB());
    ASSERT_EQUAL_SVE(p7_expected, p13.VnD());
    ASSERT_EQUAL_SVE(p8_expected, p14.VnH());
    ASSERT_EQUAL_SVE(p10_expected, p15.VnS());

    ASSERT_EQUAL_32(SVEFirstFlag, w6);
    ASSERT_EQUAL_32(NoFlag, w7);
    ASSERT_EQUAL_32(NoFlag, w8);
    ASSERT_EQUAL_32(NoFlag, w9);
    ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w10);
  }
}

TEST_SVE(sve_int_compare_vectors_wide_elements) {
  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  int src1_inputs_1[] = {0, 1, -1, -128, 127, 100, -66};
  int src2_inputs_1[] = {0, -1};
  int mask_inputs_1[] = {1, 1, 1, 1, 1, 0, 1};
  InsrHelper(&masm, z13.VnB(), src1_inputs_1);
  InsrHelper(&masm, z19.VnD(), src2_inputs_1);
  Initialise(&masm, p0.VnB(), mask_inputs_1);

  __ Cmpge(p2.VnB(), p0.Zeroing(), z13.VnB(), z19.VnD());
  __ Mrs(x2, NZCV);
  __ Cmpgt(p3.VnB(), p0.Zeroing(), z13.VnB(), z19.VnD());
  __ Mrs(x3, NZCV);

  int src1_inputs_2[] = {0, 32767, -1, -32767, 1, 1234, 0, 32766};
  int src2_inputs_2[] = {0, -32767};
  int mask_inputs_2[] = {1, 0, 1, 1, 1, 1, 1, 1};
  InsrHelper(&masm, z13.VnH(), src1_inputs_2);
  InsrHelper(&masm, z19.VnD(), src2_inputs_2);
  Initialise(&masm, p0.VnH(), mask_inputs_2);

  __ Cmple(p4.VnH(), p0.Zeroing(), z13.VnH(), z19.VnD());
  __ Mrs(x4, NZCV);
  __ Cmplt(p5.VnH(), p0.Zeroing(), z13.VnH(), z19.VnD());
  __ Mrs(x5, NZCV);

  int src1_inputs_3[] = {0, -1, 2147483647, -2147483648};
  int src2_inputs_3[] = {0, -2147483648};
  int mask_inputs_3[] = {1, 1, 1, 1};
  InsrHelper(&masm, z13.VnS(), src1_inputs_3);
  InsrHelper(&masm, z19.VnD(), src2_inputs_3);
  Initialise(&masm, p0.VnS(), mask_inputs_3);

  __ Cmpeq(p6.VnS(), p0.Zeroing(), z13.VnS(), z19.VnD());
  __ Mrs(x6, NZCV);
  __ Cmpne(p7.VnS(), p0.Zeroing(), z13.VnS(), z19.VnD());
  __ Mrs(x7, NZCV);

  int src1_inputs_4[] = {0x00, 0x80, 0x7f, 0xff, 0x7f, 0xf0, 0x0f, 0x55};
  int src2_inputs_4[] = {0x00, 0x7f};
  int mask_inputs_4[] = {1, 1, 1, 1, 0, 1, 1, 1};
  InsrHelper(&masm, z13.VnB(), src1_inputs_4);
  InsrHelper(&masm, z19.VnD(), src2_inputs_4);
  Initialise(&masm, p0.VnB(), mask_inputs_4);

  __ Cmplo(p8.VnB(), p0.Zeroing(), z13.VnB(), z19.VnD());
  __ Mrs(x8, NZCV);
  __ Cmpls(p9.VnB(), p0.Zeroing(), z13.VnB(), z19.VnD());
  __ Mrs(x9, NZCV);

  int src1_inputs_5[] = {0x0000, 0x8000, 0x7fff, 0xffff};
  int src2_inputs_5[] = {0x8000, 0xffff};
  int mask_inputs_5[] = {1, 1, 1, 1};
  InsrHelper(&masm, z13.VnS(), src1_inputs_5);
  InsrHelper(&masm, z19.VnD(), src2_inputs_5);
  Initialise(&masm, p0.VnS(), mask_inputs_5);

  __ Cmphi(p10.VnS(), p0.Zeroing(), z13.VnS(), z19.VnD());
  __ Mrs(x10, NZCV);
  __ Cmphs(p11.VnS(), p0.Zeroing(), z13.VnS(), z19.VnD());
  __ Mrs(x11, NZCV);

  END();

  if (CAN_RUN()) {
    RUN();
    int p2_expected[] = {1, 1, 1, 0, 1, 0, 0};
    ASSERT_EQUAL_SVE(p2_expected, p2.VnB());

    int p3_expected[] = {1, 1, 0, 0, 1, 0, 0};
    ASSERT_EQUAL_SVE(p3_expected, p3.VnB());

    int p4_expected[] = {0x1, 0x0, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0};
    ASSERT_EQUAL_SVE(p4_expected, p4.VnH());

    int p5_expected[] = {0x0, 0x0, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0};
    ASSERT_EQUAL_SVE(p5_expected, p5.VnH());

    int p6_expected[] = {0x1, 0x0, 0x0, 0x1};
    ASSERT_EQUAL_SVE(p6_expected, p6.VnS());

    int p7_expected[] = {0x0, 0x1, 0x1, 0x0};
    ASSERT_EQUAL_SVE(p7_expected, p7.VnS());

    int p8_expected[] = {1, 0, 0, 0, 0, 0, 1, 1};
    ASSERT_EQUAL_SVE(p8_expected, p8.VnB());

    int p9_expected[] = {1, 0, 1, 0, 0, 0, 1, 1};
    ASSERT_EQUAL_SVE(p9_expected, p9.VnB());

    int p10_expected[] = {0x0, 0x0, 0x0, 0x0};
    ASSERT_EQUAL_SVE(p10_expected, p10.VnS());

    int p11_expected[] = {0x0, 0x1, 0x0, 0x1};
    ASSERT_EQUAL_SVE(p11_expected, p11.VnS());

    ASSERT_EQUAL_32(NoFlag, w2);
    ASSERT_EQUAL_32(NoFlag, w3);
    ASSERT_EQUAL_32(NoFlag, w4);
    ASSERT_EQUAL_32(SVENotLastFlag, w5);
    ASSERT_EQUAL_32(SVEFirstFlag, w6);
    ASSERT_EQUAL_32(SVENotLastFlag, w7);
    ASSERT_EQUAL_32(SVEFirstFlag, w8);
    ASSERT_EQUAL_32(SVEFirstFlag, w9);
    ASSERT_EQUAL_32(SVENotLastFlag | SVENoneFlag, w10);
    ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w11);
  }
}
1338
Jacob Bramleye8289202019-07-31 11:25:23 +01001339TEST_SVE(sve_bitwise_imm) {
1340 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
TatWai Chonga1885a52019-04-15 17:19:14 -07001341 START();
1342
1343 // clang-format off
1344 uint64_t z21_inputs[] = {0xfedcba9876543210, 0x0123456789abcdef};
1345 uint32_t z22_inputs[] = {0xfedcba98, 0x76543210, 0x01234567, 0x89abcdef};
1346 uint16_t z23_inputs[] = {0xfedc, 0xba98, 0x7654, 0x3210,
1347 0x0123, 0x4567, 0x89ab, 0xcdef};
1348 uint8_t z24_inputs[] = {0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10,
1349 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef};
1350 // clang-format on
1351
1352 InsrHelper(&masm, z1.VnD(), z21_inputs);
1353 InsrHelper(&masm, z2.VnS(), z22_inputs);
1354 InsrHelper(&masm, z3.VnH(), z23_inputs);
1355 InsrHelper(&masm, z4.VnB(), z24_inputs);
1356
1357 __ And(z1.VnD(), z1.VnD(), 0x0000ffff0000ffff);
1358 __ And(z2.VnS(), z2.VnS(), 0xff0000ff);
1359 __ And(z3.VnH(), z3.VnH(), 0x0ff0);
1360 __ And(z4.VnB(), z4.VnB(), 0x3f);
1361
1362 InsrHelper(&masm, z5.VnD(), z21_inputs);
1363 InsrHelper(&masm, z6.VnS(), z22_inputs);
1364 InsrHelper(&masm, z7.VnH(), z23_inputs);
1365 InsrHelper(&masm, z8.VnB(), z24_inputs);
1366
1367 __ Eor(z5.VnD(), z5.VnD(), 0x0000ffff0000ffff);
1368 __ Eor(z6.VnS(), z6.VnS(), 0xff0000ff);
1369 __ Eor(z7.VnH(), z7.VnH(), 0x0ff0);
1370 __ Eor(z8.VnB(), z8.VnB(), 0x3f);
1371
1372 InsrHelper(&masm, z9.VnD(), z21_inputs);
1373 InsrHelper(&masm, z10.VnS(), z22_inputs);
1374 InsrHelper(&masm, z11.VnH(), z23_inputs);
1375 InsrHelper(&masm, z12.VnB(), z24_inputs);
1376
1377 __ Orr(z9.VnD(), z9.VnD(), 0x0000ffff0000ffff);
1378 __ Orr(z10.VnS(), z10.VnS(), 0xff0000ff);
1379 __ Orr(z11.VnH(), z11.VnH(), 0x0ff0);
1380 __ Orr(z12.VnB(), z12.VnB(), 0x3f);
1381
Jacob Bramley6069fd42019-06-24 10:20:45 +01001382 {
1383 // The `Dup` macro maps onto either `dup` or `dupm`, but has its own test,
1384 // so here we test `dupm` directly.
1385 ExactAssemblyScope guard(&masm, 4 * kInstructionSize);
1386 __ dupm(z13.VnD(), 0x7ffffff800000000);
1387 __ dupm(z14.VnS(), 0x7ffc7ffc);
1388 __ dupm(z15.VnH(), 0x3ffc);
1389 __ dupm(z16.VnB(), 0xc3);
1390 }
TatWai Chonga1885a52019-04-15 17:19:14 -07001391
1392 END();
1393
1394 if (CAN_RUN()) {
1395 RUN();
1396
1397 // clang-format off
1398 uint64_t z1_expected[] = {0x0000ba9800003210, 0x000045670000cdef};
1399 uint32_t z2_expected[] = {0xfe000098, 0x76000010, 0x01000067, 0x890000ef};
1400 uint16_t z3_expected[] = {0x0ed0, 0x0a90, 0x0650, 0x0210,
1401 0x0120, 0x0560, 0x09a0, 0x0de0};
1402 uint8_t z4_expected[] = {0x3e, 0x1c, 0x3a, 0x18, 0x36, 0x14, 0x32, 0x10,
1403 0x01, 0x23, 0x05, 0x27, 0x09, 0x2b, 0x0d, 0x2f};
1404
1405 ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
1406 ASSERT_EQUAL_SVE(z2_expected, z2.VnS());
1407 ASSERT_EQUAL_SVE(z3_expected, z3.VnH());
1408 ASSERT_EQUAL_SVE(z4_expected, z4.VnB());
1409
1410 uint64_t z5_expected[] = {0xfedc45677654cdef, 0x0123ba9889ab3210};
1411 uint32_t z6_expected[] = {0x01dcba67, 0x895432ef, 0xfe234598, 0x76abcd10};
1412 uint16_t z7_expected[] = {0xf12c, 0xb568, 0x79a4, 0x3de0,
1413 0x0ed3, 0x4a97, 0x865b, 0xc21f};
1414 uint8_t z8_expected[] = {0xc1, 0xe3, 0x85, 0xa7, 0x49, 0x6b, 0x0d, 0x2f,
1415 0x3e, 0x1c, 0x7a, 0x58, 0xb6, 0x94, 0xf2, 0xd0};
1416
1417 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
1418 ASSERT_EQUAL_SVE(z6_expected, z6.VnS());
1419 ASSERT_EQUAL_SVE(z7_expected, z7.VnH());
1420 ASSERT_EQUAL_SVE(z8_expected, z8.VnB());
1421
1422 uint64_t z9_expected[] = {0xfedcffff7654ffff, 0x0123ffff89abffff};
1423 uint32_t z10_expected[] = {0xffdcbaff, 0xff5432ff, 0xff2345ff, 0xffabcdff};
1424 uint16_t z11_expected[] = {0xfffc, 0xbff8, 0x7ff4, 0x3ff0,
1425 0x0ff3, 0x4ff7, 0x8ffb, 0xcfff};
1426 uint8_t z12_expected[] = {0xff, 0xff, 0xbf, 0xbf, 0x7f, 0x7f, 0x3f, 0x3f,
1427 0x3f, 0x3f, 0x7f, 0x7f, 0xbf, 0xbf, 0xff, 0xff};
1428
1429 ASSERT_EQUAL_SVE(z9_expected, z9.VnD());
1430 ASSERT_EQUAL_SVE(z10_expected, z10.VnS());
1431 ASSERT_EQUAL_SVE(z11_expected, z11.VnH());
1432 ASSERT_EQUAL_SVE(z12_expected, z12.VnB());
1433
1434 uint64_t z13_expected[] = {0x7ffffff800000000, 0x7ffffff800000000};
1435 uint32_t z14_expected[] = {0x7ffc7ffc, 0x7ffc7ffc, 0x7ffc7ffc, 0x7ffc7ffc};
1436 uint16_t z15_expected[] = {0x3ffc, 0x3ffc, 0x3ffc, 0x3ffc,
1437 0x3ffc, 0x3ffc, 0x3ffc, 0x3ffc};
1438 ASSERT_EQUAL_SVE(z13_expected, z13.VnD());
1439 ASSERT_EQUAL_SVE(z14_expected, z14.VnS());
1440 ASSERT_EQUAL_SVE(z15_expected, z15.VnH());
ASSERT_EQUAL_SVE(0xc3, z16.VnB());  // Check the `dupm` result for z16, which was set above but previously unverified.
1441 // clang-format on
1442 }
TatWai Chonga1885a52019-04-15 17:19:14 -07001443}
1444
Jacob Bramleye8289202019-07-31 11:25:23 +01001445TEST_SVE(sve_dup_imm) {
Jacob Bramley6069fd42019-06-24 10:20:45 +01001446 // The `Dup` macro can generate `dup` or `dupm`, and it can synthesise
1447 // immediates that neither instruction can encode directly.
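// As a rough sketch of how the macro is expected to choose an encoding (not
// asserted by this test): immediates representable as a signed 8-bit value,
// optionally shifted left by 8, can use `dup`; bitmask immediates can use
// `dupm`; anything else is synthesised, for example by materialising the
// value in a scratch register and copying it into the vector.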
1448
Jacob Bramleye8289202019-07-31 11:25:23 +01001449 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramley6069fd42019-06-24 10:20:45 +01001450 START();
1451
1452 // Encodable with `dup` (shift 0).
1453 __ Dup(z0.VnD(), -1);
1454 __ Dup(z1.VnS(), 0x7f);
1455 __ Dup(z2.VnH(), -0x80);
1456 __ Dup(z3.VnB(), 42);
1457
1458 // Encodable with `dup` (shift 8).
TatWai Chong6995bfd2019-09-26 10:48:05 +01001459 __ Dup(z4.VnD(), -42 * 256);
1460 __ Dup(z5.VnS(), -0x8000);
1461 __ Dup(z6.VnH(), 0x7f00);
Jacob Bramley6069fd42019-06-24 10:20:45 +01001462 // B-sized lanes cannot take a shift of 8.
1463
1464 // Encodable with `dupm` (but not `dup`).
1465 __ Dup(z10.VnD(), 0x3fc);
1466 __ Dup(z11.VnS(), -516097); // 0xfff81fff, as a signed int.
1467 __ Dup(z12.VnH(), 0x0001);
1468 // All values that fit B-sized lanes are encodable with `dup`.
1469
1470 // Cases that require immediate synthesis.
1471 __ Dup(z20.VnD(), 0x1234);
1472 __ Dup(z21.VnD(), -4242);
1473 __ Dup(z22.VnD(), 0xfedcba9876543210);
1474 __ Dup(z23.VnS(), 0x01020304);
1475 __ Dup(z24.VnS(), -0x01020304);
1476 __ Dup(z25.VnH(), 0x3c38);
1477 // All values that fit B-sized lanes are directly encodable.
1478
1479 END();
1480
1481 if (CAN_RUN()) {
1482 RUN();
1483
1484 ASSERT_EQUAL_SVE(0xffffffffffffffff, z0.VnD());
1485 ASSERT_EQUAL_SVE(0x0000007f, z1.VnS());
1486 ASSERT_EQUAL_SVE(0xff80, z2.VnH());
1487 ASSERT_EQUAL_SVE(0x2a, z3.VnB());
1488
TatWai Chong6995bfd2019-09-26 10:48:05 +01001489 ASSERT_EQUAL_SVE(0xffffffffffffd600, z4.VnD());
1490 ASSERT_EQUAL_SVE(0xffff8000, z5.VnS());
1491 ASSERT_EQUAL_SVE(0x7f00, z6.VnH());
Jacob Bramley6069fd42019-06-24 10:20:45 +01001492
1493 ASSERT_EQUAL_SVE(0x00000000000003fc, z10.VnD());
1494 ASSERT_EQUAL_SVE(0xfff81fff, z11.VnS());
1495 ASSERT_EQUAL_SVE(0x0001, z12.VnH());
1496
1497 ASSERT_EQUAL_SVE(0x1234, z20.VnD());
1498 ASSERT_EQUAL_SVE(0xffffffffffffef6e, z21.VnD());
1499 ASSERT_EQUAL_SVE(0xfedcba9876543210, z22.VnD());
1500 ASSERT_EQUAL_SVE(0x01020304, z23.VnS());
1501 ASSERT_EQUAL_SVE(0xfefdfcfc, z24.VnS());
1502 ASSERT_EQUAL_SVE(0x3c38, z25.VnH());
1503 }
1504}
1505
Jacob Bramleye8289202019-07-31 11:25:23 +01001506TEST_SVE(sve_inc_dec_p_scalar) {
1507 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001508 START();
1509
1510 int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
1511 Initialise(&masm, p0.VnB(), p0_inputs);
1512
1513 int p0_b_count = 9;
1514 int p0_h_count = 5;
1515 int p0_s_count = 3;
1516 int p0_d_count = 2;
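// The H, S and D counts above consider only the lowest bit of each lane's
// predicate field, i.e. every 2nd, 4th and 8th element of p0_inputs
// respectively, counting from lane 0 at the right-hand end of the array.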
1517
1518 // 64-bit operations preserve their high bits.
1519 __ Mov(x0, 0x123456780000002a);
1520 __ Decp(x0, p0.VnB());
1521
1522 __ Mov(x1, 0x123456780000002a);
1523 __ Incp(x1, p0.VnH());
1524
1525 // Check that saturation does not occur.
1526 __ Mov(x10, 1);
1527 __ Decp(x10, p0.VnS());
1528
1529 __ Mov(x11, UINT64_MAX);
1530 __ Incp(x11, p0.VnD());
1531
1532 __ Mov(x12, INT64_MAX);
1533 __ Incp(x12, p0.VnB());
1534
1535 // With an all-true predicate, these instructions increment or decrement by
1536 // the vector length.
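// For example, with a 256-bit vector length, Ptrue(p15.VnB()) activates 32
// B lanes and 16 H lanes, so x20 would decrease by 32 and x21 would increase
// by 16; the checks below query the real lane counts via GetSVELaneCount.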
Jacob Bramley0ce75842019-07-17 18:12:50 +01001537 __ Ptrue(p15.VnB());
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001538
1539 __ Mov(x20, 0x4000000000000000);
1540 __ Decp(x20, p15.VnB());
1541
1542 __ Mov(x21, 0x4000000000000000);
1543 __ Incp(x21, p15.VnH());
1544
1545 END();
1546 if (CAN_RUN()) {
1547 RUN();
1548
1549 ASSERT_EQUAL_64(0x123456780000002a - p0_b_count, x0);
1550 ASSERT_EQUAL_64(0x123456780000002a + p0_h_count, x1);
1551
1552 ASSERT_EQUAL_64(UINT64_C(1) - p0_s_count, x10);
1553 ASSERT_EQUAL_64(UINT64_MAX + p0_d_count, x11);
1554 ASSERT_EQUAL_64(static_cast<uint64_t>(INT64_MAX) + p0_b_count, x12);
1555
1556 ASSERT_EQUAL_64(0x4000000000000000 - core.GetSVELaneCount(kBRegSize), x20);
1557 ASSERT_EQUAL_64(0x4000000000000000 + core.GetSVELaneCount(kHRegSize), x21);
1558 }
1559}
1560
Jacob Bramleye8289202019-07-31 11:25:23 +01001561TEST_SVE(sve_sqinc_sqdec_p_scalar) {
1562 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001563 START();
1564
1565 int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
1566 Initialise(&masm, p0.VnB(), p0_inputs);
1567
1568 int p0_b_count = 9;
1569 int p0_h_count = 5;
1570 int p0_s_count = 3;
1571 int p0_d_count = 2;
1572
Martyn Capewellacdea502020-11-10 16:12:34 +00001573 uint64_t placeholder_high = 0x1234567800000000;
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001574
1575 // 64-bit operations preserve their high bits.
Martyn Capewellacdea502020-11-10 16:12:34 +00001576 __ Mov(x0, placeholder_high + 42);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001577 __ Sqdecp(x0, p0.VnB());
1578
Martyn Capewellacdea502020-11-10 16:12:34 +00001579 __ Mov(x1, placeholder_high + 42);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001580 __ Sqincp(x1, p0.VnH());
1581
1582 // 32-bit operations sign-extend into their high bits.
Martyn Capewellacdea502020-11-10 16:12:34 +00001583 __ Mov(x2, placeholder_high + 42);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001584 __ Sqdecp(x2, p0.VnS(), w2);
1585
Martyn Capewellacdea502020-11-10 16:12:34 +00001586 __ Mov(x3, placeholder_high + 42);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001587 __ Sqincp(x3, p0.VnD(), w3);
1588
Martyn Capewellacdea502020-11-10 16:12:34 +00001589 __ Mov(x4, placeholder_high + 1);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001590 __ Sqdecp(x4, p0.VnS(), w4);
1591
Martyn Capewellacdea502020-11-10 16:12:34 +00001592 __ Mov(x5, placeholder_high - 1);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001593 __ Sqincp(x5, p0.VnD(), w5);
1594
1595 // Check that saturation behaves correctly.
1596 __ Mov(x10, 0x8000000000000001); // INT64_MIN + 1
Martyn Capewell91d5ba32019-11-01 18:11:23 +00001597 __ Sqdecp(x10, p0.VnB());
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001598
Martyn Capewellacdea502020-11-10 16:12:34 +00001599 __ Mov(x11, placeholder_high + 0x80000001); // INT32_MIN + 1
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001600 __ Sqdecp(x11, p0.VnH(), w11);
1601
1602 __ Mov(x12, 1);
Martyn Capewell91d5ba32019-11-01 18:11:23 +00001603 __ Sqdecp(x12, p0.VnS());
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001604
Martyn Capewellacdea502020-11-10 16:12:34 +00001605 __ Mov(x13, placeholder_high + 1);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001606 __ Sqdecp(x13, p0.VnD(), w13);
1607
1608 __ Mov(x14, 0x7ffffffffffffffe); // INT64_MAX - 1
Martyn Capewell91d5ba32019-11-01 18:11:23 +00001609 __ Sqincp(x14, p0.VnB());
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001610
Martyn Capewellacdea502020-11-10 16:12:34 +00001611 __ Mov(x15, placeholder_high + 0x7ffffffe); // INT32_MAX - 1
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001612 __ Sqincp(x15, p0.VnH(), w15);
1613
1614 // Don't use x16 and x17 since they are scratch registers by default.
1615
1616 __ Mov(x18, 0xffffffffffffffff);
Martyn Capewell91d5ba32019-11-01 18:11:23 +00001617 __ Sqincp(x18, p0.VnS());
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001618
Martyn Capewellacdea502020-11-10 16:12:34 +00001619 __ Mov(x19, placeholder_high + 0xffffffff);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001620 __ Sqincp(x19, p0.VnD(), w19);
1621
Martyn Capewellacdea502020-11-10 16:12:34 +00001622 __ Mov(x20, placeholder_high + 0xffffffff);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001623 __ Sqdecp(x20, p0.VnB(), w20);
1624
1625 // With an all-true predicate, these instructions increment or decrement by
1626 // the vector length.
Jacob Bramley0ce75842019-07-17 18:12:50 +01001627 __ Ptrue(p15.VnB());
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001628
1629 __ Mov(x21, 0);
Martyn Capewell91d5ba32019-11-01 18:11:23 +00001630 __ Sqdecp(x21, p15.VnB());
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001631
1632 __ Mov(x22, 0);
Martyn Capewell91d5ba32019-11-01 18:11:23 +00001633 __ Sqincp(x22, p15.VnH());
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001634
Martyn Capewellacdea502020-11-10 16:12:34 +00001635 __ Mov(x23, placeholder_high);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001636 __ Sqdecp(x23, p15.VnS(), w23);
1637
Martyn Capewellacdea502020-11-10 16:12:34 +00001638 __ Mov(x24, placeholder_high);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001639 __ Sqincp(x24, p15.VnD(), w24);
1640
1641 END();
1642 if (CAN_RUN()) {
1643 RUN();
1644
1645 // 64-bit operations preserve their high bits.
Martyn Capewellacdea502020-11-10 16:12:34 +00001646 ASSERT_EQUAL_64(placeholder_high + 42 - p0_b_count, x0);
1647 ASSERT_EQUAL_64(placeholder_high + 42 + p0_h_count, x1);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001648
1649 // 32-bit operations sign-extend into their high bits.
1650 ASSERT_EQUAL_64(42 - p0_s_count, x2);
1651 ASSERT_EQUAL_64(42 + p0_d_count, x3);
1652 ASSERT_EQUAL_64(0xffffffff00000000 | (1 - p0_s_count), x4);
1653 ASSERT_EQUAL_64(p0_d_count - 1, x5);
1654
1655 // Check that saturation behaves correctly.
1656 ASSERT_EQUAL_64(INT64_MIN, x10);
1657 ASSERT_EQUAL_64(INT32_MIN, x11);
1658 ASSERT_EQUAL_64(1 - p0_s_count, x12);
1659 ASSERT_EQUAL_64(1 - p0_d_count, x13);
1660 ASSERT_EQUAL_64(INT64_MAX, x14);
1661 ASSERT_EQUAL_64(INT32_MAX, x15);
1662 ASSERT_EQUAL_64(p0_s_count - 1, x18);
1663 ASSERT_EQUAL_64(p0_d_count - 1, x19);
1664 ASSERT_EQUAL_64(-1 - p0_b_count, x20);
1665
1666 // Check all-true predicates.
1667 ASSERT_EQUAL_64(-core.GetSVELaneCount(kBRegSize), x21);
1668 ASSERT_EQUAL_64(core.GetSVELaneCount(kHRegSize), x22);
1669 ASSERT_EQUAL_64(-core.GetSVELaneCount(kSRegSize), x23);
1670 ASSERT_EQUAL_64(core.GetSVELaneCount(kDRegSize), x24);
1671 }
1672}
1673
Jacob Bramleye8289202019-07-31 11:25:23 +01001674TEST_SVE(sve_uqinc_uqdec_p_scalar) {
1675 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001676 START();
1677
1678 int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
1679 Initialise(&masm, p0.VnB(), p0_inputs);
1680
1681 int p0_b_count = 9;
1682 int p0_h_count = 5;
1683 int p0_s_count = 3;
1684 int p0_d_count = 2;
1685
Martyn Capewellacdea502020-11-10 16:12:34 +00001686 uint64_t placeholder_high = 0x1234567800000000;
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001687
1688 // 64-bit operations preserve their high bits.
Martyn Capewellacdea502020-11-10 16:12:34 +00001689 __ Mov(x0, placeholder_high + 42);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001690 __ Uqdecp(x0, p0.VnB());
1691
Martyn Capewellacdea502020-11-10 16:12:34 +00001692 __ Mov(x1, placeholder_high + 42);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001693 __ Uqincp(x1, p0.VnH());
1694
1695 // 32-bit operations zero-extend into their high bits.
Martyn Capewellacdea502020-11-10 16:12:34 +00001696 __ Mov(x2, placeholder_high + 42);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001697 __ Uqdecp(x2, p0.VnS(), w2);
1698
Martyn Capewellacdea502020-11-10 16:12:34 +00001699 __ Mov(x3, placeholder_high + 42);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001700 __ Uqincp(x3, p0.VnD(), w3);
1701
Martyn Capewellacdea502020-11-10 16:12:34 +00001702 __ Mov(x4, placeholder_high + 0x80000001);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001703 __ Uqdecp(x4, p0.VnS(), w4);
1704
Martyn Capewellacdea502020-11-10 16:12:34 +00001705 __ Mov(x5, placeholder_high + 0x7fffffff);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001706 __ Uqincp(x5, p0.VnD(), w5);
1707
1708 // Check that saturation behaves correctly.
1709 __ Mov(x10, 1);
1710 __ Uqdecp(x10, p0.VnB(), x10);
1711
Martyn Capewellacdea502020-11-10 16:12:34 +00001712 __ Mov(x11, placeholder_high + 1);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001713 __ Uqdecp(x11, p0.VnH(), w11);
1714
1715 __ Mov(x12, 0x8000000000000000); // INT64_MAX + 1
1716 __ Uqdecp(x12, p0.VnS(), x12);
1717
Martyn Capewellacdea502020-11-10 16:12:34 +00001718 __ Mov(x13, placeholder_high + 0x80000000); // INT32_MAX + 1
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001719 __ Uqdecp(x13, p0.VnD(), w13);
1720
1721 __ Mov(x14, 0xfffffffffffffffe); // UINT64_MAX - 1
1722 __ Uqincp(x14, p0.VnB(), x14);
1723
Martyn Capewellacdea502020-11-10 16:12:34 +00001724 __ Mov(x15, placeholder_high + 0xfffffffe); // UINT32_MAX - 1
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001725 __ Uqincp(x15, p0.VnH(), w15);
1726
1727 // Don't use x16 and x17 since they are scratch registers by default.
1728
1729 __ Mov(x18, 0x7ffffffffffffffe); // INT64_MAX - 1
1730 __ Uqincp(x18, p0.VnS(), x18);
1731
Martyn Capewellacdea502020-11-10 16:12:34 +00001732 __ Mov(x19, placeholder_high + 0x7ffffffe); // INT32_MAX - 1
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001733 __ Uqincp(x19, p0.VnD(), w19);
1734
1735 // With an all-true predicate, these instructions increment or decrement by
1736 // the vector length.
Jacob Bramley0ce75842019-07-17 18:12:50 +01001737 __ Ptrue(p15.VnB());
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001738
1739 __ Mov(x20, 0x4000000000000000);
1740 __ Uqdecp(x20, p15.VnB(), x20);
1741
1742 __ Mov(x21, 0x4000000000000000);
1743 __ Uqincp(x21, p15.VnH(), x21);
1744
Martyn Capewellacdea502020-11-10 16:12:34 +00001745 __ Mov(x22, placeholder_high + 0x40000000);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001746 __ Uqdecp(x22, p15.VnS(), w22);
1747
Martyn Capewellacdea502020-11-10 16:12:34 +00001748 __ Mov(x23, placeholder_high + 0x40000000);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001749 __ Uqincp(x23, p15.VnD(), w23);
1750
1751 END();
1752 if (CAN_RUN()) {
1753 RUN();
1754
1755 // 64-bit operations preserve their high bits.
Martyn Capewellacdea502020-11-10 16:12:34 +00001756 ASSERT_EQUAL_64(placeholder_high + 42 - p0_b_count, x0);
1757 ASSERT_EQUAL_64(placeholder_high + 42 + p0_h_count, x1);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001758
1759 // 32-bit operations zero-extend into their high bits.
1760 ASSERT_EQUAL_64(42 - p0_s_count, x2);
1761 ASSERT_EQUAL_64(42 + p0_d_count, x3);
1762 ASSERT_EQUAL_64(UINT64_C(0x80000001) - p0_s_count, x4);
1763 ASSERT_EQUAL_64(UINT64_C(0x7fffffff) + p0_d_count, x5);
1764
1765 // Check that saturation behaves correctly.
1766 ASSERT_EQUAL_64(0, x10);
1767 ASSERT_EQUAL_64(0, x11);
1768 ASSERT_EQUAL_64(0x8000000000000000 - p0_s_count, x12);
1769 ASSERT_EQUAL_64(UINT64_C(0x80000000) - p0_d_count, x13);
1770 ASSERT_EQUAL_64(UINT64_MAX, x14);
1771 ASSERT_EQUAL_64(UINT32_MAX, x15);
1772 ASSERT_EQUAL_64(0x7ffffffffffffffe + p0_s_count, x18);
1773 ASSERT_EQUAL_64(UINT64_C(0x7ffffffe) + p0_d_count, x19);
1774
1775 // Check all-true predicates.
1776 ASSERT_EQUAL_64(0x4000000000000000 - core.GetSVELaneCount(kBRegSize), x20);
1777 ASSERT_EQUAL_64(0x4000000000000000 + core.GetSVELaneCount(kHRegSize), x21);
1778 ASSERT_EQUAL_64(0x40000000 - core.GetSVELaneCount(kSRegSize), x22);
1779 ASSERT_EQUAL_64(0x40000000 + core.GetSVELaneCount(kDRegSize), x23);
1780 }
1781}
1782
Jacob Bramleye8289202019-07-31 11:25:23 +01001783TEST_SVE(sve_inc_dec_p_vector) {
1784 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001785 START();
1786
1787 // There are {5, 3, 2} active {H, S, D} lanes. B-sized lanes are ignored.
1788 int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
1789 Initialise(&masm, p0.VnB(), p0_inputs);
1790
1791 // Check that saturation does not occur.
1792
1793 int64_t z0_inputs[] = {0x1234567800000042, 0, 1, INT64_MIN};
1794 InsrHelper(&masm, z0.VnD(), z0_inputs);
1795
1796 int64_t z1_inputs[] = {0x12345678ffffff2a, 0, -1, INT64_MAX};
1797 InsrHelper(&masm, z1.VnD(), z1_inputs);
1798
1799 int32_t z2_inputs[] = {0x12340042, 0, -1, 1, INT32_MAX, INT32_MIN};
1800 InsrHelper(&masm, z2.VnS(), z2_inputs);
1801
1802 int16_t z3_inputs[] = {0x122a, 0, 1, -1, INT16_MIN, INT16_MAX};
1803 InsrHelper(&masm, z3.VnH(), z3_inputs);
1804
1805 // The MacroAssembler implements non-destructive operations using movprfx.
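// For example, Decp(z10.VnD(), p0, z0.VnD()) is expected to expand to
// something like `movprfx z10, z0; decp z10.d, p0` (a sketch of the likely
// expansion, not asserted by this test).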
1806 __ Decp(z10.VnD(), p0, z0.VnD());
1807 __ Decp(z11.VnD(), p0, z1.VnD());
1808 __ Decp(z12.VnS(), p0, z2.VnS());
1809 __ Decp(z13.VnH(), p0, z3.VnH());
1810
1811 __ Incp(z14.VnD(), p0, z0.VnD());
1812 __ Incp(z15.VnD(), p0, z1.VnD());
1813 __ Incp(z16.VnS(), p0, z2.VnS());
1814 __ Incp(z17.VnH(), p0, z3.VnH());
1815
1816 // Also test destructive forms.
1817 __ Mov(z4, z0);
1818 __ Mov(z5, z1);
1819 __ Mov(z6, z2);
1820 __ Mov(z7, z3);
1821
1822 __ Decp(z0.VnD(), p0);
1823 __ Decp(z1.VnD(), p0);
1824 __ Decp(z2.VnS(), p0);
1825 __ Decp(z3.VnH(), p0);
1826
1827 __ Incp(z4.VnD(), p0);
1828 __ Incp(z5.VnD(), p0);
1829 __ Incp(z6.VnS(), p0);
1830 __ Incp(z7.VnH(), p0);
1831
1832 END();
1833 if (CAN_RUN()) {
1834 RUN();
1835
1836 // z0_inputs[...] - number of active D lanes (2)
1837 int64_t z0_expected[] = {0x1234567800000040, -2, -1, 0x7ffffffffffffffe};
1838 ASSERT_EQUAL_SVE(z0_expected, z0.VnD());
1839
1840 // z1_inputs[...] - number of active D lanes (2)
1841 int64_t z1_expected[] = {0x12345678ffffff28, -2, -3, 0x7ffffffffffffffd};
1842 ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
1843
1844 // z2_inputs[...] - number of active S lanes (3)
1845 int32_t z2_expected[] = {0x1234003f, -3, -4, -2, 0x7ffffffc, 0x7ffffffd};
1846 ASSERT_EQUAL_SVE(z2_expected, z2.VnS());
1847
1848 // z3_inputs[...] - number of active H lanes (5)
1849 int16_t z3_expected[] = {0x1225, -5, -4, -6, 0x7ffb, 0x7ffa};
1850 ASSERT_EQUAL_SVE(z3_expected, z3.VnH());
1851
1852 // z0_inputs[...] + number of active D lanes (2)
1853 uint64_t z4_expected[] = {0x1234567800000044, 2, 3, 0x8000000000000002};
1854 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
1855
1856 // z1_inputs[...] + number of active D lanes (2)
1857 uint64_t z5_expected[] = {0x12345678ffffff2c, 2, 1, 0x8000000000000001};
1858 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
1859
1860 // z2_inputs[...] + number of active S lanes (3)
1861 uint32_t z6_expected[] = {0x12340045, 3, 2, 4, 0x80000002, 0x80000003};
1862 ASSERT_EQUAL_SVE(z6_expected, z6.VnS());
1863
1864 // z3_inputs[...] + number of active H lanes (5)
1865 uint16_t z7_expected[] = {0x122f, 5, 6, 4, 0x8005, 0x8004};
1866 ASSERT_EQUAL_SVE(z7_expected, z7.VnH());
1867
1868 // Check that the non-destructive macros produced the same results.
1869 ASSERT_EQUAL_SVE(z0_expected, z10.VnD());
1870 ASSERT_EQUAL_SVE(z1_expected, z11.VnD());
1871 ASSERT_EQUAL_SVE(z2_expected, z12.VnS());
1872 ASSERT_EQUAL_SVE(z3_expected, z13.VnH());
1873 ASSERT_EQUAL_SVE(z4_expected, z14.VnD());
1874 ASSERT_EQUAL_SVE(z5_expected, z15.VnD());
1875 ASSERT_EQUAL_SVE(z6_expected, z16.VnS());
1876 ASSERT_EQUAL_SVE(z7_expected, z17.VnH());
1877 }
1878}
1879
Jacob Bramleye8289202019-07-31 11:25:23 +01001880TEST_SVE(sve_inc_dec_ptrue_vector) {
1881 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001882 START();
1883
1884 // With an all-true predicate, these instructions increment or decrement by
1885 // the vector length.
Jacob Bramley0ce75842019-07-17 18:12:50 +01001886 __ Ptrue(p15.VnB());
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001887
1888 __ Dup(z0.VnD(), 0);
1889 __ Decp(z0.VnD(), p15);
1890
1891 __ Dup(z1.VnS(), 0);
1892 __ Decp(z1.VnS(), p15);
1893
1894 __ Dup(z2.VnH(), 0);
1895 __ Decp(z2.VnH(), p15);
1896
1897 __ Dup(z3.VnD(), 0);
1898 __ Incp(z3.VnD(), p15);
1899
1900 __ Dup(z4.VnS(), 0);
1901 __ Incp(z4.VnS(), p15);
1902
1903 __ Dup(z5.VnH(), 0);
1904 __ Incp(z5.VnH(), p15);
1905
1906 END();
1907 if (CAN_RUN()) {
1908 RUN();
1909
1910 int d_lane_count = core.GetSVELaneCount(kDRegSize);
1911 int s_lane_count = core.GetSVELaneCount(kSRegSize);
1912 int h_lane_count = core.GetSVELaneCount(kHRegSize);
1913
1914 for (int i = 0; i < d_lane_count; i++) {
1915 ASSERT_EQUAL_SVE_LANE(-d_lane_count, z0.VnD(), i);
1916 ASSERT_EQUAL_SVE_LANE(d_lane_count, z3.VnD(), i);
1917 }
1918
1919 for (int i = 0; i < s_lane_count; i++) {
1920 ASSERT_EQUAL_SVE_LANE(-s_lane_count, z1.VnS(), i);
1921 ASSERT_EQUAL_SVE_LANE(s_lane_count, z4.VnS(), i);
1922 }
1923
1924 for (int i = 0; i < h_lane_count; i++) {
1925 ASSERT_EQUAL_SVE_LANE(-h_lane_count, z2.VnH(), i);
1926 ASSERT_EQUAL_SVE_LANE(h_lane_count, z5.VnH(), i);
1927 }
1928 }
1929}
1930
Jacob Bramleye8289202019-07-31 11:25:23 +01001931TEST_SVE(sve_sqinc_sqdec_p_vector) {
1932 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01001933 START();
1934
1935 // There are {5, 3, 2} active {H, S, D} lanes. B-sized lanes are ignored.
1936 int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
1937 Initialise(&masm, p0.VnB(), p0_inputs);
1938
1939 // Check that saturation behaves correctly.
1940
1941 int64_t z0_inputs[] = {0x1234567800000042, 0, 1, INT64_MIN};
1942 InsrHelper(&masm, z0.VnD(), z0_inputs);
1943
1944 int64_t z1_inputs[] = {0x12345678ffffff2a, 0, -1, INT64_MAX};
1945 InsrHelper(&masm, z1.VnD(), z1_inputs);
1946
1947 int32_t z2_inputs[] = {0x12340042, 0, -1, 1, INT32_MAX, INT32_MIN};
1948 InsrHelper(&masm, z2.VnS(), z2_inputs);
1949
1950 int16_t z3_inputs[] = {0x122a, 0, 1, -1, INT16_MIN, INT16_MAX};
1951 InsrHelper(&masm, z3.VnH(), z3_inputs);
1952
1953 // The MacroAssembler implements non-destructive operations using movprfx.
1954 __ Sqdecp(z10.VnD(), p0, z0.VnD());
1955 __ Sqdecp(z11.VnD(), p0, z1.VnD());
1956 __ Sqdecp(z12.VnS(), p0, z2.VnS());
1957 __ Sqdecp(z13.VnH(), p0, z3.VnH());
1958
1959 __ Sqincp(z14.VnD(), p0, z0.VnD());
1960 __ Sqincp(z15.VnD(), p0, z1.VnD());
1961 __ Sqincp(z16.VnS(), p0, z2.VnS());
1962 __ Sqincp(z17.VnH(), p0, z3.VnH());
1963
1964 // Also test destructive forms.
1965 __ Mov(z4, z0);
1966 __ Mov(z5, z1);
1967 __ Mov(z6, z2);
1968 __ Mov(z7, z3);
1969
1970 __ Sqdecp(z0.VnD(), p0);
1971 __ Sqdecp(z1.VnD(), p0);
1972 __ Sqdecp(z2.VnS(), p0);
1973 __ Sqdecp(z3.VnH(), p0);
1974
1975 __ Sqincp(z4.VnD(), p0);
1976 __ Sqincp(z5.VnD(), p0);
1977 __ Sqincp(z6.VnS(), p0);
1978 __ Sqincp(z7.VnH(), p0);
1979
1980 END();
1981 if (CAN_RUN()) {
1982 RUN();
1983
1984 // z0_inputs[...] - number of active D lanes (2)
1985 int64_t z0_expected[] = {0x1234567800000040, -2, -1, INT64_MIN};
1986 ASSERT_EQUAL_SVE(z0_expected, z0.VnD());
1987
1988 // z1_inputs[...] - number of active D lanes (2)
1989 int64_t z1_expected[] = {0x12345678ffffff28, -2, -3, 0x7ffffffffffffffd};
1990 ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
1991
1992 // z2_inputs[...] - number of active S lanes (3)
1993 int32_t z2_expected[] = {0x1234003f, -3, -4, -2, 0x7ffffffc, INT32_MIN};
1994 ASSERT_EQUAL_SVE(z2_expected, z2.VnS());
1995
1996 // z3_inputs[...] - number of active H lanes (5)
1997 int16_t z3_expected[] = {0x1225, -5, -4, -6, INT16_MIN, 0x7ffa};
1998 ASSERT_EQUAL_SVE(z3_expected, z3.VnH());
1999
2000 // z0_inputs[...] + number of active D lanes (2)
2001 uint64_t z4_expected[] = {0x1234567800000044, 2, 3, 0x8000000000000002};
2002 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
2003
2004 // z1_inputs[...] + number of active D lanes (2)
2005 uint64_t z5_expected[] = {0x12345678ffffff2c, 2, 1, INT64_MAX};
2006 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
2007
2008 // z2_inputs[...] + number of active S lanes (3)
2009 uint32_t z6_expected[] = {0x12340045, 3, 2, 4, INT32_MAX, 0x80000003};
2010 ASSERT_EQUAL_SVE(z6_expected, z6.VnS());
2011
2012 // z3_inputs[...] + number of active H lanes (5)
2013 uint16_t z7_expected[] = {0x122f, 5, 6, 4, 0x8005, INT16_MAX};
2014 ASSERT_EQUAL_SVE(z7_expected, z7.VnH());
2015
2016 // Check that the non-destructive macros produced the same results.
2017 ASSERT_EQUAL_SVE(z0_expected, z10.VnD());
2018 ASSERT_EQUAL_SVE(z1_expected, z11.VnD());
2019 ASSERT_EQUAL_SVE(z2_expected, z12.VnS());
2020 ASSERT_EQUAL_SVE(z3_expected, z13.VnH());
2021 ASSERT_EQUAL_SVE(z4_expected, z14.VnD());
2022 ASSERT_EQUAL_SVE(z5_expected, z15.VnD());
2023 ASSERT_EQUAL_SVE(z6_expected, z16.VnS());
2024 ASSERT_EQUAL_SVE(z7_expected, z17.VnH());
2025 }
2026}
2027
Jacob Bramleye8289202019-07-31 11:25:23 +01002028TEST_SVE(sve_sqinc_sqdec_ptrue_vector) {
2029 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01002030 START();
2031
2032 // With an all-true predicate, these instructions increment or decrement by
2033 // the vector length.
Jacob Bramley0ce75842019-07-17 18:12:50 +01002034 __ Ptrue(p15.VnB());
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01002035
2036 __ Dup(z0.VnD(), 0);
2037 __ Sqdecp(z0.VnD(), p15);
2038
2039 __ Dup(z1.VnS(), 0);
2040 __ Sqdecp(z1.VnS(), p15);
2041
2042 __ Dup(z2.VnH(), 0);
2043 __ Sqdecp(z2.VnH(), p15);
2044
2045 __ Dup(z3.VnD(), 0);
2046 __ Sqincp(z3.VnD(), p15);
2047
2048 __ Dup(z4.VnS(), 0);
2049 __ Sqincp(z4.VnS(), p15);
2050
2051 __ Dup(z5.VnH(), 0);
2052 __ Sqincp(z5.VnH(), p15);
2053
2054 END();
2055 if (CAN_RUN()) {
2056 RUN();
2057
2058 int d_lane_count = core.GetSVELaneCount(kDRegSize);
2059 int s_lane_count = core.GetSVELaneCount(kSRegSize);
2060 int h_lane_count = core.GetSVELaneCount(kHRegSize);
2061
2062 for (int i = 0; i < d_lane_count; i++) {
2063 ASSERT_EQUAL_SVE_LANE(-d_lane_count, z0.VnD(), i);
2064 ASSERT_EQUAL_SVE_LANE(d_lane_count, z3.VnD(), i);
2065 }
2066
2067 for (int i = 0; i < s_lane_count; i++) {
2068 ASSERT_EQUAL_SVE_LANE(-s_lane_count, z1.VnS(), i);
2069 ASSERT_EQUAL_SVE_LANE(s_lane_count, z4.VnS(), i);
2070 }
2071
2072 for (int i = 0; i < h_lane_count; i++) {
2073 ASSERT_EQUAL_SVE_LANE(-h_lane_count, z2.VnH(), i);
2074 ASSERT_EQUAL_SVE_LANE(h_lane_count, z5.VnH(), i);
2075 }
2076 }
2077}
2078
Jacob Bramleye8289202019-07-31 11:25:23 +01002079TEST_SVE(sve_uqinc_uqdec_p_vector) {
2080 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01002081 START();
2082
2083 // There are {5, 3, 2} active {H, S, D} lanes. B-sized lanes are ignored.
2084 int p0_inputs[] = {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1};
2085 Initialise(&masm, p0.VnB(), p0_inputs);
2086
2087 // Check that saturation behaves correctly.
2088
2089 uint64_t z0_inputs[] = {0x1234567800000042, 0, 1, 0x8000000000000000};
2090 InsrHelper(&masm, z0.VnD(), z0_inputs);
2091
2092 uint64_t z1_inputs[] = {0x12345678ffffff2a, 0, UINT64_MAX, INT64_MAX};
2093 InsrHelper(&masm, z1.VnD(), z1_inputs);
2094
2095 uint32_t z2_inputs[] = {0x12340042, 0, UINT32_MAX, 1, INT32_MAX, 0x80000000};
2096 InsrHelper(&masm, z2.VnS(), z2_inputs);
2097
2098 uint16_t z3_inputs[] = {0x122a, 0, 1, UINT16_MAX, 0x8000, INT16_MAX};
2099 InsrHelper(&masm, z3.VnH(), z3_inputs);
2100
2101 // The MacroAssembler implements non-destructive operations using movprfx.
2102 __ Uqdecp(z10.VnD(), p0, z0.VnD());
2103 __ Uqdecp(z11.VnD(), p0, z1.VnD());
2104 __ Uqdecp(z12.VnS(), p0, z2.VnS());
2105 __ Uqdecp(z13.VnH(), p0, z3.VnH());
2106
2107 __ Uqincp(z14.VnD(), p0, z0.VnD());
2108 __ Uqincp(z15.VnD(), p0, z1.VnD());
2109 __ Uqincp(z16.VnS(), p0, z2.VnS());
2110 __ Uqincp(z17.VnH(), p0, z3.VnH());
2111
2112 // Also test destructive forms.
2113 __ Mov(z4, z0);
2114 __ Mov(z5, z1);
2115 __ Mov(z6, z2);
2116 __ Mov(z7, z3);
2117
2118 __ Uqdecp(z0.VnD(), p0);
2119 __ Uqdecp(z1.VnD(), p0);
2120 __ Uqdecp(z2.VnS(), p0);
2121 __ Uqdecp(z3.VnH(), p0);
2122
2123 __ Uqincp(z4.VnD(), p0);
2124 __ Uqincp(z5.VnD(), p0);
2125 __ Uqincp(z6.VnS(), p0);
2126 __ Uqincp(z7.VnH(), p0);
2127
2128 END();
2129 if (CAN_RUN()) {
2130 RUN();
2131
2132 // z0_inputs[...] - number of active D lanes (2)
2133 uint64_t z0_expected[] = {0x1234567800000040, 0, 0, 0x7ffffffffffffffe};
2134 ASSERT_EQUAL_SVE(z0_expected, z0.VnD());
2135
2136 // z1_inputs[...] - number of active D lanes (2)
2137 uint64_t z1_expected[] = {0x12345678ffffff28,
2138 0,
2139 0xfffffffffffffffd,
2140 0x7ffffffffffffffd};
2141 ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
2142
2143 // z2_inputs[...] - number of active S lanes (3)
2144 uint32_t z2_expected[] =
2145 {0x1234003f, 0, 0xfffffffc, 0, 0x7ffffffc, 0x7ffffffd};
2146 ASSERT_EQUAL_SVE(z2_expected, z2.VnS());
2147
2148 // z3_inputs[...] - number of active H lanes (5)
2149 uint16_t z3_expected[] = {0x1225, 0, 0, 0xfffa, 0x7ffb, 0x7ffa};
2150 ASSERT_EQUAL_SVE(z3_expected, z3.VnH());
2151
2152 // z0_inputs[...] + number of active D lanes (2)
2153 uint64_t z4_expected[] = {0x1234567800000044, 2, 3, 0x8000000000000002};
2154 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
2155
2156 // z1_inputs[...] + number of active D lanes (2)
2157 uint64_t z5_expected[] = {0x12345678ffffff2c,
2158 2,
2159 UINT64_MAX,
2160 0x8000000000000001};
2161 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
2162
2163 // z2_inputs[...] + number of active S lanes (3)
2164 uint32_t z6_expected[] =
2165 {0x12340045, 3, UINT32_MAX, 4, 0x80000002, 0x80000003};
2166 ASSERT_EQUAL_SVE(z6_expected, z6.VnS());
2167
2168 // z3_inputs[...] + number of active H lanes (5)
2169 uint16_t z7_expected[] = {0x122f, 5, 6, UINT16_MAX, 0x8005, 0x8004};
2170 ASSERT_EQUAL_SVE(z7_expected, z7.VnH());
2171
2172 // Check that the non-destructive macros produced the same results.
2173 ASSERT_EQUAL_SVE(z0_expected, z10.VnD());
2174 ASSERT_EQUAL_SVE(z1_expected, z11.VnD());
2175 ASSERT_EQUAL_SVE(z2_expected, z12.VnS());
2176 ASSERT_EQUAL_SVE(z3_expected, z13.VnH());
2177 ASSERT_EQUAL_SVE(z4_expected, z14.VnD());
2178 ASSERT_EQUAL_SVE(z5_expected, z15.VnD());
2179 ASSERT_EQUAL_SVE(z6_expected, z16.VnS());
2180 ASSERT_EQUAL_SVE(z7_expected, z17.VnH());
2181 }
2182}
2183
Jacob Bramleye8289202019-07-31 11:25:23 +01002184TEST_SVE(sve_uqinc_uqdec_ptrue_vector) {
2185 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01002186 START();
2187
2188 // With an all-true predicate, these instructions increment or decrement by
2189 // the vector length.
Jacob Bramley0ce75842019-07-17 18:12:50 +01002190 __ Ptrue(p15.VnB());
Jacob Bramleyd1686cb2019-05-28 17:39:05 +01002191
2192 __ Mov(x0, 0x1234567800000000);
2193 __ Mov(x1, 0x12340000);
2194 __ Mov(x2, 0x1200);
2195
2196 __ Dup(z0.VnD(), x0);
2197 __ Uqdecp(z0.VnD(), p15);
2198
2199 __ Dup(z1.VnS(), x1);
2200 __ Uqdecp(z1.VnS(), p15);
2201
2202 __ Dup(z2.VnH(), x2);
2203 __ Uqdecp(z2.VnH(), p15);
2204
2205 __ Dup(z3.VnD(), x0);
2206 __ Uqincp(z3.VnD(), p15);
2207
2208 __ Dup(z4.VnS(), x1);
2209 __ Uqincp(z4.VnS(), p15);
2210
2211 __ Dup(z5.VnH(), x2);
2212 __ Uqincp(z5.VnH(), p15);
2213
2214 END();
2215 if (CAN_RUN()) {
2216 RUN();
2217
2218 int d_lane_count = core.GetSVELaneCount(kDRegSize);
2219 int s_lane_count = core.GetSVELaneCount(kSRegSize);
2220 int h_lane_count = core.GetSVELaneCount(kHRegSize);
2221
2222 for (int i = 0; i < d_lane_count; i++) {
2223 ASSERT_EQUAL_SVE_LANE(0x1234567800000000 - d_lane_count, z0.VnD(), i);
2224 ASSERT_EQUAL_SVE_LANE(0x1234567800000000 + d_lane_count, z3.VnD(), i);
2225 }
2226
2227 for (int i = 0; i < s_lane_count; i++) {
2228 ASSERT_EQUAL_SVE_LANE(0x12340000 - s_lane_count, z1.VnS(), i);
2229 ASSERT_EQUAL_SVE_LANE(0x12340000 + s_lane_count, z4.VnS(), i);
2230 }
2231
2232 for (int i = 0; i < h_lane_count; i++) {
2233 ASSERT_EQUAL_SVE_LANE(0x1200 - h_lane_count, z2.VnH(), i);
2234 ASSERT_EQUAL_SVE_LANE(0x1200 + h_lane_count, z5.VnH(), i);
2235 }
2236 }
2237}
2238
Jacob Bramleye8289202019-07-31 11:25:23 +01002239TEST_SVE(sve_index) {
2240 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramleycd8148c2019-07-11 18:43:20 +01002241 START();
2242
2243 // Simple cases.
2244 __ Index(z0.VnB(), 0, 1);
2245 __ Index(z1.VnH(), 1, 1);
2246 __ Index(z2.VnS(), 2, 1);
2247 __ Index(z3.VnD(), 3, 1);
2248
2249 // Synthesised immediates.
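// `index` itself only encodes immediates in [-16, 15], so for these values
// the macro is expected to move the out-of-range operand(s) into scratch
// registers and use a register form of `index` (a sketch, not asserted here).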
2250 __ Index(z4.VnB(), 42, -1);
2251 __ Index(z5.VnH(), -1, 42);
2252 __ Index(z6.VnS(), 42, 42);
2253
2254 // Register arguments.
2255 __ Mov(x0, 42);
2256 __ Mov(x1, -3);
2257 __ Index(z10.VnD(), x0, x1);
2258 __ Index(z11.VnB(), w0, w1);
2259 // The register size should correspond to the lane size, but VIXL allows any
2260 // register at least as big as the lane size.
2261 __ Index(z12.VnB(), x0, x1);
2262 __ Index(z13.VnH(), w0, x1);
2263 __ Index(z14.VnS(), x0, w1);
2264
2265 // Integer overflow.
2266 __ Index(z20.VnB(), UINT8_MAX - 2, 2);
2267 __ Index(z21.VnH(), 7, -3);
2268 __ Index(z22.VnS(), INT32_MAX - 2, 1);
2269 __ Index(z23.VnD(), INT64_MIN + 6, -7);
2270
2271 END();
2272
2273 if (CAN_RUN()) {
2274 RUN();
2275
2276 int b_lane_count = core.GetSVELaneCount(kBRegSize);
2277 int h_lane_count = core.GetSVELaneCount(kHRegSize);
2278 int s_lane_count = core.GetSVELaneCount(kSRegSize);
2279 int d_lane_count = core.GetSVELaneCount(kDRegSize);
2280
2281 uint64_t b_mask = GetUintMask(kBRegSize);
2282 uint64_t h_mask = GetUintMask(kHRegSize);
2283 uint64_t s_mask = GetUintMask(kSRegSize);
2284 uint64_t d_mask = GetUintMask(kDRegSize);
2285
2286 // Simple cases.
2287 for (int i = 0; i < b_lane_count; i++) {
2288 ASSERT_EQUAL_SVE_LANE((0 + i) & b_mask, z0.VnB(), i);
2289 }
2290 for (int i = 0; i < h_lane_count; i++) {
2291 ASSERT_EQUAL_SVE_LANE((1 + i) & h_mask, z1.VnH(), i);
2292 }
2293 for (int i = 0; i < s_lane_count; i++) {
2294 ASSERT_EQUAL_SVE_LANE((2 + i) & s_mask, z2.VnS(), i);
2295 }
2296 for (int i = 0; i < d_lane_count; i++) {
2297 ASSERT_EQUAL_SVE_LANE((3 + i) & d_mask, z3.VnD(), i);
2298 }
2299
2300 // Synthesised immediates.
2301 for (int i = 0; i < b_lane_count; i++) {
2302 ASSERT_EQUAL_SVE_LANE((42 - i) & b_mask, z4.VnB(), i);
2303 }
2304 for (int i = 0; i < h_lane_count; i++) {
2305 ASSERT_EQUAL_SVE_LANE((-1 + (42 * i)) & h_mask, z5.VnH(), i);
2306 }
2307 for (int i = 0; i < s_lane_count; i++) {
2308 ASSERT_EQUAL_SVE_LANE((42 + (42 * i)) & s_mask, z6.VnS(), i);
2309 }
2310
2311 // Register arguments.
2312 for (int i = 0; i < d_lane_count; i++) {
2313 ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & d_mask, z10.VnD(), i);
2314 }
2315 for (int i = 0; i < b_lane_count; i++) {
2316 ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & b_mask, z11.VnB(), i);
2317 }
2318 for (int i = 0; i < b_lane_count; i++) {
2319 ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & b_mask, z12.VnB(), i);
2320 }
2321 for (int i = 0; i < h_lane_count; i++) {
2322 ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & h_mask, z13.VnH(), i);
2323 }
2324 for (int i = 0; i < s_lane_count; i++) {
2325 ASSERT_EQUAL_SVE_LANE((42 - (3 * i)) & s_mask, z14.VnS(), i);
2326 }
2327
2328 // Integer overflow.
2329 uint8_t expected_z20[] = {0x05, 0x03, 0x01, 0xff, 0xfd};
2330 ASSERT_EQUAL_SVE(expected_z20, z20.VnB());
2331 uint16_t expected_z21[] = {0xfffb, 0xfffe, 0x0001, 0x0004, 0x0007};
2332 ASSERT_EQUAL_SVE(expected_z21, z21.VnH());
2333 uint32_t expected_z22[] = {0x80000000, 0x7fffffff, 0x7ffffffe, 0x7ffffffd};
2334 ASSERT_EQUAL_SVE(expected_z22, z22.VnS());
2335 uint64_t expected_z23[] = {0x7fffffffffffffff, 0x8000000000000006};
2336 ASSERT_EQUAL_SVE(expected_z23, z23.VnD());
2337 }
2338}
2339
TatWai Chongc844bb22019-06-10 15:32:53 -07002340TEST(sve_int_compare_count_and_limit_scalars) {
2341 SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2342 START();
2343
2344 __ Mov(w20, 0xfffffffd);
2345 __ Mov(w21, 0xffffffff);
2346
2347 __ Whilele(p0.VnB(), w20, w21);
2348 __ Mrs(x0, NZCV);
2349 __ Whilele(p1.VnH(), w20, w21);
2350 __ Mrs(x1, NZCV);
2351
2352 __ Mov(w20, 0xffffffff);
2353 __ Mov(w21, 0x00000000);
2354
2355 __ Whilelt(p2.VnS(), w20, w21);
2356 __ Mrs(x2, NZCV);
2357 __ Whilelt(p3.VnD(), w20, w21);
2358 __ Mrs(x3, NZCV);
2359
2360 __ Mov(w20, 0xfffffffd);
2361 __ Mov(w21, 0xffffffff);
2362
2363 __ Whilels(p4.VnB(), w20, w21);
2364 __ Mrs(x4, NZCV);
2365 __ Whilels(p5.VnH(), w20, w21);
2366 __ Mrs(x5, NZCV);
2367
2368 __ Mov(w20, 0xffffffff);
2369 __ Mov(w21, 0x00000000);
2370
2371 __ Whilelo(p6.VnS(), w20, w21);
2372 __ Mrs(x6, NZCV);
2373 __ Whilelo(p7.VnD(), w20, w21);
2374 __ Mrs(x7, NZCV);
2375
2376 __ Mov(x20, 0xfffffffffffffffd);
2377 __ Mov(x21, 0xffffffffffffffff);
2378
2379 __ Whilele(p8.VnB(), x20, x21);
2380 __ Mrs(x8, NZCV);
2381 __ Whilele(p9.VnH(), x20, x21);
2382 __ Mrs(x9, NZCV);
2383
2384 __ Mov(x20, 0xffffffffffffffff);
2385 __ Mov(x21, 0x0000000000000000);
2386
2387 __ Whilelt(p10.VnS(), x20, x21);
2388 __ Mrs(x10, NZCV);
2389 __ Whilelt(p11.VnD(), x20, x21);
2390 __ Mrs(x11, NZCV);
2391
2392 __ Mov(x20, 0xfffffffffffffffd);
2393 __ Mov(x21, 0xffffffffffffffff);
2394
2395 __ Whilels(p12.VnB(), x20, x21);
2396 __ Mrs(x12, NZCV);
2397 __ Whilels(p13.VnH(), x20, x21);
2398 __ Mrs(x13, NZCV);
2399
2400 __ Mov(x20, 0xffffffffffffffff);
2401 __ Mov(x21, 0x0000000000000000);
2402
2403 __ Whilelo(p14.VnS(), x20, x21);
2404 __ Mrs(x14, NZCV);
2405 __ Whilelo(p15.VnD(), x20, x21);
2406 __ Mrs(x15, NZCV);
2407
2408 END();
2409
2410 if (CAN_RUN()) {
2411 RUN();
2412
2413 // 0b...00000000'00000111
2414 int p0_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
2415 ASSERT_EQUAL_SVE(p0_expected, p0.VnB());
2416
2417 // 0b...00000000'00010101
2418 int p1_expected[] = {0, 0, 0, 0, 0, 1, 1, 1};
2419 ASSERT_EQUAL_SVE(p1_expected, p1.VnH());
2420
2421 int p2_expected[] = {0x0, 0x0, 0x0, 0x1};
2422 ASSERT_EQUAL_SVE(p2_expected, p2.VnS());
2423
2424 int p3_expected[] = {0x00, 0x01};
2425 ASSERT_EQUAL_SVE(p3_expected, p3.VnD());
2426
2427 // 0b...11111111'11111111
2428 int p4_expected[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
2429 ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
2430
2431 // 0b...01010101'01010101
2432 int p5_expected[] = {1, 1, 1, 1, 1, 1, 1, 1};
2433 ASSERT_EQUAL_SVE(p5_expected, p5.VnH());
2434
2435 int p6_expected[] = {0x0, 0x0, 0x0, 0x0};
2436 ASSERT_EQUAL_SVE(p6_expected, p6.VnS());
2437
2438 int p7_expected[] = {0x00, 0x00};
2439 ASSERT_EQUAL_SVE(p7_expected, p7.VnD());
2440
2441 // 0b...00000000'00000111
2442 int p8_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
2443 ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
2444
2445 // 0b...00000000'00010101
2446 int p9_expected[] = {0, 0, 0, 0, 0, 1, 1, 1};
2447 ASSERT_EQUAL_SVE(p9_expected, p9.VnH());
2448
2449 int p10_expected[] = {0x0, 0x0, 0x0, 0x1};
2450 ASSERT_EQUAL_SVE(p10_expected, p10.VnS());
2451
2452 int p11_expected[] = {0x00, 0x01};
2453 ASSERT_EQUAL_SVE(p11_expected, p11.VnD());
2454
2455 // 0b...11111111'11111111
2456 int p12_expected[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
2457 ASSERT_EQUAL_SVE(p12_expected, p12.VnB());
2458
2459 // 0b...01010101'01010101
2460 int p13_expected[] = {1, 1, 1, 1, 1, 1, 1, 1};
2461 ASSERT_EQUAL_SVE(p13_expected, p13.VnH());
2462
2463 int p14_expected[] = {0x0, 0x0, 0x0, 0x0};
2464 ASSERT_EQUAL_SVE(p14_expected, p14.VnS());
2465
2466 int p15_expected[] = {0x00, 0x00};
2467 ASSERT_EQUAL_SVE(p15_expected, p15.VnD());
2468
2469 ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w0);
2470 ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w1);
2471 ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w2);
2472 ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w3);
2473 ASSERT_EQUAL_32(SVEFirstFlag, w4);
2474 ASSERT_EQUAL_32(SVEFirstFlag, w5);
2475 ASSERT_EQUAL_32(SVENoneFlag | SVENotLastFlag, w6);
2476 ASSERT_EQUAL_32(SVENoneFlag | SVENotLastFlag, w7);
2477 ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w8);
2478 ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w9);
2479 ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w10);
2480 ASSERT_EQUAL_32(SVEFirstFlag | SVENotLastFlag, w11);
2481 ASSERT_EQUAL_32(SVEFirstFlag, w12);
2482 ASSERT_EQUAL_32(SVEFirstFlag, w13);
2483 ASSERT_EQUAL_32(SVENoneFlag | SVENotLastFlag, w14);
2484 ASSERT_EQUAL_32(SVENoneFlag | SVENotLastFlag, w15);
2485 }
2486}
2487
Martyn Capewell78de9512020-10-28 14:55:49 +00002488TEST(sve_int_compare_count_and_limit_scalars_regression_test) {
2489 SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2490 START();
2491
2492 __ Mov(w0, 0x7ffffffd);
2493 __ Mov(w1, 0x7fffffff);
2494 __ Whilele(p0.VnB(), w0, w1);
2495
2496 END();
2497
2498 if (CAN_RUN()) {
2499 RUN();
2500
2501 int p0_expected[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
2502 ASSERT_EQUAL_SVE(p0_expected, p0.VnB());
2503 }
2504}
2505
TatWai Chong302729c2019-06-14 16:18:51 -07002506TEST(sve_int_compare_vectors_signed_imm) {
2507 SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2508 START();
2509
2510 int z13_inputs[] = {0, 1, -1, -15, 126, -127, -126, -15};
2511 int mask_inputs1[] = {1, 1, 1, 0, 1, 1, 1, 1};
2512 InsrHelper(&masm, z13.VnB(), z13_inputs);
2513 Initialise(&masm, p0.VnB(), mask_inputs1);
2514
2515 __ Cmpeq(p2.VnB(), p0.Zeroing(), z13.VnB(), -15);
2516 __ Mrs(x2, NZCV);
2517 __ Cmpeq(p3.VnB(), p0.Zeroing(), z13.VnB(), -127);
2518
2519 int z14_inputs[] = {0, 1, -1, -32767, -32766, 32767, 32766, 0};
2520 int mask_inputs2[] = {1, 1, 1, 0, 1, 1, 1, 1};
2521 InsrHelper(&masm, z14.VnH(), z14_inputs);
2522 Initialise(&masm, p0.VnH(), mask_inputs2);
2523
2524 __ Cmpge(p4.VnH(), p0.Zeroing(), z14.VnH(), -1);
2525 __ Mrs(x4, NZCV);
2526 __ Cmpge(p5.VnH(), p0.Zeroing(), z14.VnH(), -32767);
2527
2528 int z15_inputs[] = {0, 1, -1, INT_MIN};
2529 int mask_inputs3[] = {0, 1, 1, 1};
2530 InsrHelper(&masm, z15.VnS(), z15_inputs);
2531 Initialise(&masm, p0.VnS(), mask_inputs3);
2532
2533 __ Cmpgt(p6.VnS(), p0.Zeroing(), z15.VnS(), 0);
2534 __ Mrs(x6, NZCV);
2535 __ Cmpgt(p7.VnS(), p0.Zeroing(), z15.VnS(), INT_MIN + 1);
2536
2537 __ Cmplt(p8.VnS(), p0.Zeroing(), z15.VnS(), 0);
2538 __ Mrs(x8, NZCV);
2539 __ Cmplt(p9.VnS(), p0.Zeroing(), z15.VnS(), INT_MIN + 1);
2540
2541 int64_t z16_inputs[] = {0, -1};
2542 int mask_inputs4[] = {1, 1};
2543 InsrHelper(&masm, z16.VnD(), z16_inputs);
2544 Initialise(&masm, p0.VnD(), mask_inputs4);
2545
2546 __ Cmple(p10.VnD(), p0.Zeroing(), z16.VnD(), -1);
2547 __ Mrs(x10, NZCV);
2548 __ Cmple(p11.VnD(), p0.Zeroing(), z16.VnD(), LLONG_MIN);
2549
2550 __ Cmpne(p12.VnD(), p0.Zeroing(), z16.VnD(), -1);
2551 __ Mrs(x12, NZCV);
2552 __ Cmpne(p13.VnD(), p0.Zeroing(), z16.VnD(), LLONG_MAX);
2553
2554 END();
2555
2556 if (CAN_RUN()) {
2557 RUN();
2558
2559 int p2_expected[] = {0, 0, 0, 0, 0, 0, 0, 1};
2560 ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
2561
2562 int p3_expected[] = {0, 0, 0, 0, 0, 1, 0, 0};
2563 ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
2564
2565 int p4_expected[] = {0x1, 0x1, 0x1, 0x0, 0x0, 0x1, 0x1, 0x1};
2566 ASSERT_EQUAL_SVE(p4_expected, p4.VnH());
2567
2568 int p5_expected[] = {0x1, 0x1, 0x1, 0x0, 0x1, 0x1, 0x1, 0x1};
2569 ASSERT_EQUAL_SVE(p5_expected, p5.VnH());
2570
2571 int p6_expected[] = {0x0, 0x1, 0x0, 0x0};
2572 ASSERT_EQUAL_SVE(p6_expected, p6.VnS());
2573
2574 int p7_expected[] = {0x0, 0x1, 0x1, 0x0};
2575 ASSERT_EQUAL_SVE(p7_expected, p7.VnS());
2576
2577 int p8_expected[] = {0x0, 0x0, 0x1, 0x1};
2578 ASSERT_EQUAL_SVE(p8_expected, p8.VnS());
2579
2580 int p9_expected[] = {0x0, 0x0, 0x0, 0x1};
2581 ASSERT_EQUAL_SVE(p9_expected, p9.VnS());
2582
2583 int p10_expected[] = {0x00, 0x01};
2584 ASSERT_EQUAL_SVE(p10_expected, p10.VnD());
2585
2586 int p11_expected[] = {0x00, 0x00};
2587 ASSERT_EQUAL_SVE(p11_expected, p11.VnD());
2588
2589 int p12_expected[] = {0x01, 0x00};
2590 ASSERT_EQUAL_SVE(p12_expected, p12.VnD());
2591
2592 int p13_expected[] = {0x01, 0x01};
2593 ASSERT_EQUAL_SVE(p13_expected, p13.VnD());
2594
2595 ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w2);
2596 ASSERT_EQUAL_32(SVEFirstFlag, w4);
2597 ASSERT_EQUAL_32(NoFlag, w6);
2598 ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w8);
2599 ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w10);
2600 ASSERT_EQUAL_32(NoFlag, w12);
2601 }
2602}
2603
2604TEST(sve_int_compare_vectors_unsigned_imm) {
2605 SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2606 START();
2607
2608 uint32_t src1_inputs[] = {0xf7, 0x0f, 0x8f, 0x1f, 0x83, 0x12, 0x00, 0xf1};
2609 int mask_inputs1[] = {1, 1, 1, 0, 1, 1, 0, 1};
2610 InsrHelper(&masm, z13.VnB(), src1_inputs);
2611 Initialise(&masm, p0.VnB(), mask_inputs1);
2612
2613 __ Cmphi(p2.VnB(), p0.Zeroing(), z13.VnB(), 0x0f);
2614 __ Mrs(x2, NZCV);
2615 __ Cmphi(p3.VnB(), p0.Zeroing(), z13.VnB(), 0xf0);
2616
2617 uint32_t src2_inputs[] = {0xffff, 0x8000, 0x1fff, 0x0000, 0x1234};
2618 int mask_inputs2[] = {1, 1, 1, 1, 0};
2619 InsrHelper(&masm, z13.VnH(), src2_inputs);
2620 Initialise(&masm, p0.VnH(), mask_inputs2);
2621
2622 __ Cmphs(p4.VnH(), p0.Zeroing(), z13.VnH(), 0x1f);
2623 __ Mrs(x4, NZCV);
2624 __ Cmphs(p5.VnH(), p0.Zeroing(), z13.VnH(), 0x1fff);
2625
2626 uint32_t src3_inputs[] = {0xffffffff, 0xfedcba98, 0x0000ffff, 0x00000000};
2627 int mask_inputs3[] = {1, 1, 1, 1};
2628 InsrHelper(&masm, z13.VnS(), src3_inputs);
2629 Initialise(&masm, p0.VnS(), mask_inputs3);
2630
2631 __ Cmplo(p6.VnS(), p0.Zeroing(), z13.VnS(), 0x3f);
2632 __ Mrs(x6, NZCV);
2633 __ Cmplo(p7.VnS(), p0.Zeroing(), z13.VnS(), 0x3f3f3f3f);
2634
2635 uint64_t src4_inputs[] = {0xffffffffffffffff, 0x0000000000000000};
2636 int mask_inputs4[] = {1, 1};
2637 InsrHelper(&masm, z13.VnD(), src4_inputs);
2638 Initialise(&masm, p0.VnD(), mask_inputs4);
2639
2640 __ Cmpls(p8.VnD(), p0.Zeroing(), z13.VnD(), 0x2f);
2641 __ Mrs(x8, NZCV);
2642 __ Cmpls(p9.VnD(), p0.Zeroing(), z13.VnD(), 0x800000000000000);
2643
2644 END();
2645
2646 if (CAN_RUN()) {
2647 RUN();
2648
2649 int p2_expected[] = {1, 0, 1, 0, 1, 1, 0, 1};
2650 ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
2651
2652 int p3_expected[] = {1, 0, 0, 0, 0, 0, 0, 1};
2653 ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
2654
2655 int p4_expected[] = {0x1, 0x1, 0x1, 0x0, 0x0};
2656 ASSERT_EQUAL_SVE(p4_expected, p4.VnH());
2657
2658 int p5_expected[] = {0x1, 0x1, 0x1, 0x0, 0x0};
2659 ASSERT_EQUAL_SVE(p5_expected, p5.VnH());
2660
2661 int p6_expected[] = {0x0, 0x0, 0x0, 0x1};
2662 ASSERT_EQUAL_SVE(p6_expected, p6.VnS());
2663
2664 int p7_expected[] = {0x0, 0x0, 0x1, 0x1};
2665 ASSERT_EQUAL_SVE(p7_expected, p7.VnS());
2666
2667 int p8_expected[] = {0x00, 0x01};
2668 ASSERT_EQUAL_SVE(p8_expected, p8.VnD());
2669
2670 int p9_expected[] = {0x00, 0x01};
2671 ASSERT_EQUAL_SVE(p9_expected, p9.VnD());
2672
2673 ASSERT_EQUAL_32(SVEFirstFlag, w2);
2674 ASSERT_EQUAL_32(NoFlag, w4);
2675 ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w6);
2676 ASSERT_EQUAL_32(SVENotLastFlag | SVEFirstFlag, w8);
2677 }
2678}
2679
TatWai Chongc844bb22019-06-10 15:32:53 -07002680TEST(sve_int_compare_conditionally_terminate_scalars) {
2681 SETUP_WITH_FEATURES(CPUFeatures::kSVE);
2682 START();
2683
2684 __ Mov(x0, 0xfedcba9887654321);
2685 __ Mov(x1, 0x1000100010001000);
2686
Jacob Bramleyb40aa692019-10-07 19:24:29 +01002687 // Initialise Z and C. These are preserved by cterm*, and the V flag is set to
2688 // !C if the condition does not hold.
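// (When the termination condition holds, cterm* sets N - reported as
// SVEFirstFlag below - and clears V; when it does not hold, N is cleared.)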
2689 __ Mov(x10, NoFlag);
2690 __ Msr(NZCV, x10);
2691
TatWai Chongc844bb22019-06-10 15:32:53 -07002692 __ Ctermeq(w0, w0);
2693 __ Mrs(x2, NZCV);
2694 __ Ctermeq(x0, x1);
2695 __ Mrs(x3, NZCV);
2696 __ Ctermne(x0, x0);
2697 __ Mrs(x4, NZCV);
2698 __ Ctermne(w0, w1);
2699 __ Mrs(x5, NZCV);
2700
Jacob Bramleyb40aa692019-10-07 19:24:29 +01002701 // As above, but with all flags initially set.
2702 __ Mov(x10, NZCVFlag);
2703 __ Msr(NZCV, x10);
2704
2705 __ Ctermeq(w0, w0);
2706 __ Mrs(x6, NZCV);
2707 __ Ctermeq(x0, x1);
2708 __ Mrs(x7, NZCV);
2709 __ Ctermne(x0, x0);
2710 __ Mrs(x8, NZCV);
2711 __ Ctermne(w0, w1);
2712 __ Mrs(x9, NZCV);
2713
TatWai Chongc844bb22019-06-10 15:32:53 -07002714 END();
2715
2716 if (CAN_RUN()) {
2717 RUN();
2718
2719 ASSERT_EQUAL_32(SVEFirstFlag, w2);
2720 ASSERT_EQUAL_32(VFlag, w3);
2721 ASSERT_EQUAL_32(VFlag, w4);
2722 ASSERT_EQUAL_32(SVEFirstFlag, w5);
Jacob Bramleyb40aa692019-10-07 19:24:29 +01002723
2724 ASSERT_EQUAL_32(SVEFirstFlag | ZCFlag, w6);
2725 ASSERT_EQUAL_32(ZCFlag, w7);
2726 ASSERT_EQUAL_32(ZCFlag, w8);
2727 ASSERT_EQUAL_32(SVEFirstFlag | ZCFlag, w9);
TatWai Chongc844bb22019-06-10 15:32:53 -07002728 }
2729}
2730
Jacob Bramley0ce75842019-07-17 18:12:50 +01002731// Work out what the architectural `PredTest` pseudocode should produce for the
2732// given result and governing predicate.
2733template <typename Tg, typename Td, int N>
2734static StatusFlags GetPredTestFlags(const Td (&pd)[N],
2735 const Tg (&pg)[N],
2736 int vl) {
2737 int first = -1;
2738 int last = -1;
2739 bool any_active = false;
2740
2741 // Only consider potentially-active lanes.
2742 int start = (N > vl) ? (N - vl) : 0;
2743 for (int i = start; i < N; i++) {
2744 if ((pg[i] & 1) == 1) {
2745 // Look for the first and last active lanes.
2746 // Note that the 'first' (lowest-numbered) lane is the one with the
// highest array index, since lane 0 is the last element of these arrays.
2747 if (last < 0) last = i;
2748 first = i;
2749 // Look for any active lanes that are also active in pd.
2750 if ((pd[i] & 1) == 1) any_active = true;
2751 }
2752 }
2753
2754 uint32_t flags = 0;
2755 if ((first >= 0) && ((pd[first] & 1) == 1)) flags |= SVEFirstFlag;
2756 if (!any_active) flags |= SVENoneFlag;
2757 if ((last < 0) || ((pd[last] & 1) == 0)) flags |= SVENotLastFlag;
2758 return static_cast<StatusFlags>(flags);
2759}
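// For example (a sketch, with lane 0 at the right-hand end as in the arrays
// below): for pg = {1, 0, 1, 0} and pd = {0, 0, 1, 0}, the lowest-numbered
// active lane (lane 1) is set in pd, so SVEFirstFlag (N) is set; some active
// lane of pd is set, so SVENoneFlag (Z) is clear; and the highest-numbered
// active lane (lane 3) is clear in pd, so SVENotLastFlag (C) is set.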
2760
2761typedef void (MacroAssembler::*PfirstPnextFn)(const PRegisterWithLaneSize& pd,
2762 const PRegister& pg,
2763 const PRegisterWithLaneSize& pn);
2764template <typename Tg, typename Tn, typename Td>
Jacob Bramleye8289202019-07-31 11:25:23 +01002765static void PfirstPnextHelper(Test* config,
2766 PfirstPnextFn macro,
Jacob Bramley0ce75842019-07-17 18:12:50 +01002767 unsigned lane_size_in_bits,
2768 const Tg& pg_inputs,
2769 const Tn& pn_inputs,
2770 const Td& pd_expected) {
Jacob Bramleye8289202019-07-31 11:25:23 +01002771 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramley0ce75842019-07-17 18:12:50 +01002772 START();
2773
2774 PRegister pg = p15;
2775 PRegister pn = p14;
2776 Initialise(&masm, pg.WithLaneSize(lane_size_in_bits), pg_inputs);
2777 Initialise(&masm, pn.WithLaneSize(lane_size_in_bits), pn_inputs);
2778
2779 // Initialise NZCV to an impossible value, to check that we actually write it.
2780 __ Mov(x10, NZCVFlag);
2781
2782 // If pd.Is(pn), the MacroAssembler simply passes the arguments directly to
2783 // the Assembler.
2784 __ Msr(NZCV, x10);
2785 __ Mov(p0, pn);
2786 (masm.*macro)(p0.WithLaneSize(lane_size_in_bits),
2787 pg,
2788 p0.WithLaneSize(lane_size_in_bits));
2789 __ Mrs(x0, NZCV);
2790
2791 // The MacroAssembler supports non-destructive use.
2792 __ Msr(NZCV, x10);
2793 (masm.*macro)(p1.WithLaneSize(lane_size_in_bits),
2794 pg,
2795 pn.WithLaneSize(lane_size_in_bits));
2796 __ Mrs(x1, NZCV);
2797
2798 // If pd.Aliases(pg), the macro requires a scratch register.
2799 {
2800 UseScratchRegisterScope temps(&masm);
2801 temps.Include(p13);
2802 __ Msr(NZCV, x10);
2803 __ Mov(p2, p15);
2804 (masm.*macro)(p2.WithLaneSize(lane_size_in_bits),
2805 p2,
2806 pn.WithLaneSize(lane_size_in_bits));
2807 __ Mrs(x2, NZCV);
2808 }
2809
2810 END();
2811
2812 if (CAN_RUN()) {
2813 RUN();
2814
2815 // Check that the inputs weren't modified.
2816 ASSERT_EQUAL_SVE(pn_inputs, pn.WithLaneSize(lane_size_in_bits));
2817 ASSERT_EQUAL_SVE(pg_inputs, pg.WithLaneSize(lane_size_in_bits));
2818
2819 // Check the primary operation.
2820 ASSERT_EQUAL_SVE(pd_expected, p0.WithLaneSize(lane_size_in_bits));
2821 ASSERT_EQUAL_SVE(pd_expected, p1.WithLaneSize(lane_size_in_bits));
2822 ASSERT_EQUAL_SVE(pd_expected, p2.WithLaneSize(lane_size_in_bits));
2823
2824 // Check that the flags were properly set.
2825 StatusFlags nzcv_expected =
2826 GetPredTestFlags(pd_expected,
2827 pg_inputs,
2828 core.GetSVELaneCount(kBRegSize));
2829 ASSERT_EQUAL_64(nzcv_expected, x0);
2830 ASSERT_EQUAL_64(nzcv_expected, x1);
2831 ASSERT_EQUAL_64(nzcv_expected, x2);
2832 }
2833}
2834
2835template <typename Tg, typename Tn, typename Td>
Jacob Bramleye8289202019-07-31 11:25:23 +01002836static void PfirstHelper(Test* config,
2837 const Tg& pg_inputs,
Jacob Bramley0ce75842019-07-17 18:12:50 +01002838 const Tn& pn_inputs,
2839 const Td& pd_expected) {
Jacob Bramleye8289202019-07-31 11:25:23 +01002840 PfirstPnextHelper(config,
2841 &MacroAssembler::Pfirst,
Jacob Bramley0ce75842019-07-17 18:12:50 +01002842 kBRegSize, // pfirst only accepts B-sized lanes.
2843 pg_inputs,
2844 pn_inputs,
2845 pd_expected);
2846}
2847
2848template <typename Tg, typename Tn, typename Td>
Jacob Bramleye8289202019-07-31 11:25:23 +01002849static void PnextHelper(Test* config,
2850 unsigned lane_size_in_bits,
Jacob Bramley0ce75842019-07-17 18:12:50 +01002851 const Tg& pg_inputs,
2852 const Tn& pn_inputs,
2853 const Td& pd_expected) {
Jacob Bramleye8289202019-07-31 11:25:23 +01002854 PfirstPnextHelper(config,
2855 &MacroAssembler::Pnext,
Jacob Bramley0ce75842019-07-17 18:12:50 +01002856 lane_size_in_bits,
2857 pg_inputs,
2858 pn_inputs,
2859 pd_expected);
2860}
2861
Jacob Bramleye8289202019-07-31 11:25:23 +01002862TEST_SVE(sve_pfirst) {
Jacob Bramley0ce75842019-07-17 18:12:50 +01002863 // Provide more lanes than kPRegMinSize (to check propagation if we have a
2864 // large VL), but few enough to make the test easy to read.
2865 int in0[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
2866 int in1[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
2867 int in2[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
2868 int in3[] = {0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1};
2869 int in4[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
2870 VIXL_ASSERT(ArrayLength(in0) > kPRegMinSize);
2871
2872 // Pfirst finds the first active lane in pg, and activates the corresponding
2873 // lane in pn (if it isn't already active).
2874
2875 // The first active lane in in1 is here. |
2876 // v
2877 int exp10[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
2878 int exp12[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0};
2879 int exp13[] = {0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1};
2880 int exp14[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
Jacob Bramleye8289202019-07-31 11:25:23 +01002881 PfirstHelper(config, in1, in0, exp10);
2882 PfirstHelper(config, in1, in2, exp12);
2883 PfirstHelper(config, in1, in3, exp13);
2884 PfirstHelper(config, in1, in4, exp14);
Jacob Bramley0ce75842019-07-17 18:12:50 +01002885
2886 // The first active lane in in2 is here. |
2887 // v
2888 int exp20[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0};
2889 int exp21[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0};
2890 int exp23[] = {0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1};
2891 int exp24[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0};
Jacob Bramleye8289202019-07-31 11:25:23 +01002892 PfirstHelper(config, in2, in0, exp20);
2893 PfirstHelper(config, in2, in1, exp21);
2894 PfirstHelper(config, in2, in3, exp23);
2895 PfirstHelper(config, in2, in4, exp24);
Jacob Bramley0ce75842019-07-17 18:12:50 +01002896
2897 // The first active lane in in3 is here. |
2898 // v
2899 int exp30[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
2900 int exp31[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1};
2901 int exp32[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1};
2902 int exp34[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
Jacob Bramleye8289202019-07-31 11:25:23 +01002903 PfirstHelper(config, in3, in0, exp30);
2904 PfirstHelper(config, in3, in1, exp31);
2905 PfirstHelper(config, in3, in2, exp32);
2906 PfirstHelper(config, in3, in4, exp34);
Jacob Bramley0ce75842019-07-17 18:12:50 +01002907
2908 // | The first active lane in in4 is here.
2909 // v
2910 int exp40[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
2911 int exp41[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
2912 int exp42[] = {1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
2913 int exp43[] = {1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1};
Jacob Bramleye8289202019-07-31 11:25:23 +01002914 PfirstHelper(config, in4, in0, exp40);
2915 PfirstHelper(config, in4, in1, exp41);
2916 PfirstHelper(config, in4, in2, exp42);
2917 PfirstHelper(config, in4, in3, exp43);
Jacob Bramley0ce75842019-07-17 18:12:50 +01002918
2919 // If pg is all inactive, the input is passed through unchanged.
Jacob Bramleye8289202019-07-31 11:25:23 +01002920 PfirstHelper(config, in0, in0, in0);
2921 PfirstHelper(config, in0, in1, in1);
2922 PfirstHelper(config, in0, in2, in2);
2923 PfirstHelper(config, in0, in3, in3);
Jacob Bramley0ce75842019-07-17 18:12:50 +01002924
2925 // If the values of pg and pn match, the value is passed through unchanged.
Jacob Bramleye8289202019-07-31 11:25:23 +01002926 PfirstHelper(config, in0, in0, in0);
2927 PfirstHelper(config, in1, in1, in1);
2928 PfirstHelper(config, in2, in2, in2);
2929 PfirstHelper(config, in3, in3, in3);
Jacob Bramley0ce75842019-07-17 18:12:50 +01002930}
2931
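// A minimal, illustrative reference model of the `pfirst` data behaviour
// exercised above. It is not referenced by PfirstHelper or the tests; the
// name and the array convention (lanes listed with the highest-numbered lane
// first, as in the inputs above) are assumptions made for illustration only.
static void PfirstReferenceSketch(const int* pg, int* pdn, int lane_count) {
  // Lane 0 is the last array element, so scan from the end of the array to
  // find the lowest-numbered lane that is active in pg. As elsewhere, only
  // the lowest bit of each field counts as active.
  for (int i = lane_count - 1; i >= 0; i--) {
    if ((pg[i] & 1) != 0) {
      // Activate that lane in pdn; every other lane is left unchanged.
      pdn[i] = 1;
      return;
    }
  }
  // If pg has no active lanes, pdn is left unchanged.
}
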
Jacob Bramleye8289202019-07-31 11:25:23 +01002932TEST_SVE(sve_pfirst_alias) {
2933 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramley0ce75842019-07-17 18:12:50 +01002934 START();
2935
2936 // Check that the Simulator behaves correctly when all arguments are aliased.
2937 int in_b[] = {0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0};
2938 int in_h[] = {0, 0, 0, 0, 1, 1, 0, 0};
2939 int in_s[] = {0, 1, 1, 0};
2940 int in_d[] = {1, 1};
2941
2942 Initialise(&masm, p0.VnB(), in_b);
2943 Initialise(&masm, p1.VnH(), in_h);
2944 Initialise(&masm, p2.VnS(), in_s);
2945 Initialise(&masm, p3.VnD(), in_d);
2946
2947 // Initialise NZCV to an impossible value, to check that we actually write it.
2948 __ Mov(x10, NZCVFlag);
2949
2950 __ Msr(NZCV, x10);
Jacob Bramley7b5819c2020-06-17 17:29:16 +01002951 __ Pfirst(p0.VnB(), p0, p0.VnB());
Jacob Bramley0ce75842019-07-17 18:12:50 +01002952 __ Mrs(x0, NZCV);
2953
2954 __ Msr(NZCV, x10);
Jacob Bramley7b5819c2020-06-17 17:29:16 +01002955 __ Pfirst(p1.VnB(), p1, p1.VnB());
Jacob Bramley0ce75842019-07-17 18:12:50 +01002956 __ Mrs(x1, NZCV);
2957
2958 __ Msr(NZCV, x10);
Jacob Bramley7b5819c2020-06-17 17:29:16 +01002959 __ Pfirst(p2.VnB(), p2, p2.VnB());
Jacob Bramley0ce75842019-07-17 18:12:50 +01002960 __ Mrs(x2, NZCV);
2961
2962 __ Msr(NZCV, x10);
Jacob Bramley7b5819c2020-06-17 17:29:16 +01002963 __ Pfirst(p3.VnB(), p3, p3.VnB());
Jacob Bramley0ce75842019-07-17 18:12:50 +01002964 __ Mrs(x3, NZCV);
2965
2966 END();
2967
2968 if (CAN_RUN()) {
2969 RUN();
2970
2971 // The first lane from pg is already active in pdn, so the P register should
2972 // be unchanged.
2973 ASSERT_EQUAL_SVE(in_b, p0.VnB());
2974 ASSERT_EQUAL_SVE(in_h, p1.VnH());
2975 ASSERT_EQUAL_SVE(in_s, p2.VnS());
2976 ASSERT_EQUAL_SVE(in_d, p3.VnD());
2977
2978 ASSERT_EQUAL_64(SVEFirstFlag, x0);
2979 ASSERT_EQUAL_64(SVEFirstFlag, x1);
2980 ASSERT_EQUAL_64(SVEFirstFlag, x2);
2981 ASSERT_EQUAL_64(SVEFirstFlag, x3);
2982 }
2983}
2984
Jacob Bramleye8289202019-07-31 11:25:23 +01002985TEST_SVE(sve_pnext_b) {
Jacob Bramley0ce75842019-07-17 18:12:50 +01002986 // TODO: Once we have the infrastructure, provide more lanes than kPRegMinSize
2987 // (to check propagation if we have a large VL), but few enough to make the
2988 // test easy to read.
2989 // For now, we just use kPRegMinSize so that the test works anywhere.
2990 int in0[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
2991 int in1[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
2992 int in2[] = {0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
2993 int in3[] = {0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1};
2994 int in4[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
2995
2996 // Pnext activates the next element that is true in pg, after the last-active
2997 // element in pn. If all pn elements are false (as in in0), it starts looking
2998 // at element 0. See the reference sketch after this test.
2999
3000 // There are no active lanes in in0, so the result is simply the first active
3001 // lane from pg.
3002 int exp00[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3003 int exp10[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
3004 int exp20[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0};
3005 int exp30[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
3006 int exp40[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3007
3008 // The last active lane in in1 is here. |
3009 // v
3010 int exp01[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3011 int exp11[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3012 int exp21[] = {0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3013 int exp31[] = {0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3014 int exp41[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3015
3016 // | The last active lane in in2 is here.
3017 // v
3018 int exp02[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3019 int exp12[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3020 int exp22[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3021 int exp32[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3022 int exp42[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3023
3024 // | The last active lane in in3 is here.
3025 // v
3026 int exp03[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3027 int exp13[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3028 int exp23[] = {0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3029 int exp33[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3030 int exp43[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3031
3032 // | The last active lane in in4 is here.
3033 // v
3034 int exp04[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3035 int exp14[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3036 int exp24[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3037 int exp34[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3038 int exp44[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
3039
Jacob Bramleye8289202019-07-31 11:25:23 +01003040 PnextHelper(config, kBRegSize, in0, in0, exp00);
3041 PnextHelper(config, kBRegSize, in1, in0, exp10);
3042 PnextHelper(config, kBRegSize, in2, in0, exp20);
3043 PnextHelper(config, kBRegSize, in3, in0, exp30);
3044 PnextHelper(config, kBRegSize, in4, in0, exp40);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003045
Jacob Bramleye8289202019-07-31 11:25:23 +01003046 PnextHelper(config, kBRegSize, in0, in1, exp01);
3047 PnextHelper(config, kBRegSize, in1, in1, exp11);
3048 PnextHelper(config, kBRegSize, in2, in1, exp21);
3049 PnextHelper(config, kBRegSize, in3, in1, exp31);
3050 PnextHelper(config, kBRegSize, in4, in1, exp41);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003051
Jacob Bramleye8289202019-07-31 11:25:23 +01003052 PnextHelper(config, kBRegSize, in0, in2, exp02);
3053 PnextHelper(config, kBRegSize, in1, in2, exp12);
3054 PnextHelper(config, kBRegSize, in2, in2, exp22);
3055 PnextHelper(config, kBRegSize, in3, in2, exp32);
3056 PnextHelper(config, kBRegSize, in4, in2, exp42);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003057
Jacob Bramleye8289202019-07-31 11:25:23 +01003058 PnextHelper(config, kBRegSize, in0, in3, exp03);
3059 PnextHelper(config, kBRegSize, in1, in3, exp13);
3060 PnextHelper(config, kBRegSize, in2, in3, exp23);
3061 PnextHelper(config, kBRegSize, in3, in3, exp33);
3062 PnextHelper(config, kBRegSize, in4, in3, exp43);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003063
Jacob Bramleye8289202019-07-31 11:25:23 +01003064 PnextHelper(config, kBRegSize, in0, in4, exp04);
3065 PnextHelper(config, kBRegSize, in1, in4, exp14);
3066 PnextHelper(config, kBRegSize, in2, in4, exp24);
3067 PnextHelper(config, kBRegSize, in3, in4, exp34);
3068 PnextHelper(config, kBRegSize, in4, in4, exp44);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003069}
3070
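// A minimal, illustrative reference model of the `pnext` data behaviour
// exercised by these tests. It is not referenced by PnextHelper; the name and
// the array convention (highest-numbered lane first, lane 0 last) are
// assumptions made for illustration only.
static void PnextReferenceSketch(const int* pg, int* pdn, int lane_count) {
  // Find the highest-numbered active lane in pdn (-1 if there is none). Only
  // the lowest bit of each field counts as active.
  int last_active = -1;
  for (int lane = 0; lane < lane_count; lane++) {
    if ((pdn[lane_count - 1 - lane] & 1) != 0) last_active = lane;
  }
  // The result has at most one active lane: the lowest-numbered lane that is
  // active in pg and comes strictly after pdn's last active lane.
  for (int i = 0; i < lane_count; i++) pdn[i] = 0;
  for (int lane = last_active + 1; lane < lane_count; lane++) {
    if ((pg[lane_count - 1 - lane] & 1) != 0) {
      pdn[lane_count - 1 - lane] = 1;
      return;
    }
  }
}
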
Jacob Bramleye8289202019-07-31 11:25:23 +01003071TEST_SVE(sve_pnext_h) {
Jacob Bramley0ce75842019-07-17 18:12:50 +01003072 // TODO: Once we have the infrastructure, provide more lanes than kPRegMinSize
3073 // (to check propagation if we have a large VL), but few enough to make the
3074 // test easy to read.
3075 // For now, we just use kPRegMinSize so that the test works anywhere.
3076 int in0[] = {0, 0, 0, 0, 0, 0, 0, 0};
3077 int in1[] = {0, 0, 0, 1, 0, 2, 1, 0};
3078 int in2[] = {0, 1, 2, 0, 2, 0, 2, 0};
3079 int in3[] = {0, 0, 0, 3, 0, 0, 0, 3};
3080 int in4[] = {3, 0, 0, 0, 0, 0, 0, 0};
3081
3082 // Pnext activates the next element that is true in pg, after the last-active
3083 // element in pn. If all pn elements are false (as in in0), it starts looking
3084 // at element 0.
3085 //
3086 // As for other SVE instructions, elements are only considered to be active if
3087 // the least-significant bit in each field is one. Other bits are ignored.
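  // For example (worked from the inputs above): the fields holding 2 in in1
  // and in2 have their lowest bit clear, so those lanes are inactive; only
  // fields with an odd value count as active here.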
3088
3089 // There are no active lanes in in0, so the result is simply the first active
3090 // lane from pg.
3091 int exp00[] = {0, 0, 0, 0, 0, 0, 0, 0};
3092 int exp10[] = {0, 0, 0, 0, 0, 0, 1, 0};
3093 int exp20[] = {0, 1, 0, 0, 0, 0, 0, 0};
3094 int exp30[] = {0, 0, 0, 0, 0, 0, 0, 1};
3095 int exp40[] = {1, 0, 0, 0, 0, 0, 0, 0};
3096
3097 // | The last active lane in in1 is here.
3098 // v
3099 int exp01[] = {0, 0, 0, 0, 0, 0, 0, 0};
3100 int exp11[] = {0, 0, 0, 0, 0, 0, 0, 0};
3101 int exp21[] = {0, 1, 0, 0, 0, 0, 0, 0};
3102 int exp31[] = {0, 0, 0, 0, 0, 0, 0, 0};
3103 int exp41[] = {1, 0, 0, 0, 0, 0, 0, 0};
3104
3105 // | The last active lane in in2 is here.
3106 // v
3107 int exp02[] = {0, 0, 0, 0, 0, 0, 0, 0};
3108 int exp12[] = {0, 0, 0, 0, 0, 0, 0, 0};
3109 int exp22[] = {0, 0, 0, 0, 0, 0, 0, 0};
3110 int exp32[] = {0, 0, 0, 0, 0, 0, 0, 0};
3111 int exp42[] = {1, 0, 0, 0, 0, 0, 0, 0};
3112
3113 // | The last active lane in in3 is here.
3114 // v
3115 int exp03[] = {0, 0, 0, 0, 0, 0, 0, 0};
3116 int exp13[] = {0, 0, 0, 0, 0, 0, 0, 0};
3117 int exp23[] = {0, 1, 0, 0, 0, 0, 0, 0};
3118 int exp33[] = {0, 0, 0, 0, 0, 0, 0, 0};
3119 int exp43[] = {1, 0, 0, 0, 0, 0, 0, 0};
3120
3121 // | The last active lane in in4 is here.
3122 // v
3123 int exp04[] = {0, 0, 0, 0, 0, 0, 0, 0};
3124 int exp14[] = {0, 0, 0, 0, 0, 0, 0, 0};
3125 int exp24[] = {0, 0, 0, 0, 0, 0, 0, 0};
3126 int exp34[] = {0, 0, 0, 0, 0, 0, 0, 0};
3127 int exp44[] = {0, 0, 0, 0, 0, 0, 0, 0};
3128
Jacob Bramleye8289202019-07-31 11:25:23 +01003129 PnextHelper(config, kHRegSize, in0, in0, exp00);
3130 PnextHelper(config, kHRegSize, in1, in0, exp10);
3131 PnextHelper(config, kHRegSize, in2, in0, exp20);
3132 PnextHelper(config, kHRegSize, in3, in0, exp30);
3133 PnextHelper(config, kHRegSize, in4, in0, exp40);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003134
Jacob Bramleye8289202019-07-31 11:25:23 +01003135 PnextHelper(config, kHRegSize, in0, in1, exp01);
3136 PnextHelper(config, kHRegSize, in1, in1, exp11);
3137 PnextHelper(config, kHRegSize, in2, in1, exp21);
3138 PnextHelper(config, kHRegSize, in3, in1, exp31);
3139 PnextHelper(config, kHRegSize, in4, in1, exp41);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003140
Jacob Bramleye8289202019-07-31 11:25:23 +01003141 PnextHelper(config, kHRegSize, in0, in2, exp02);
3142 PnextHelper(config, kHRegSize, in1, in2, exp12);
3143 PnextHelper(config, kHRegSize, in2, in2, exp22);
3144 PnextHelper(config, kHRegSize, in3, in2, exp32);
3145 PnextHelper(config, kHRegSize, in4, in2, exp42);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003146
Jacob Bramleye8289202019-07-31 11:25:23 +01003147 PnextHelper(config, kHRegSize, in0, in3, exp03);
3148 PnextHelper(config, kHRegSize, in1, in3, exp13);
3149 PnextHelper(config, kHRegSize, in2, in3, exp23);
3150 PnextHelper(config, kHRegSize, in3, in3, exp33);
3151 PnextHelper(config, kHRegSize, in4, in3, exp43);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003152
Jacob Bramleye8289202019-07-31 11:25:23 +01003153 PnextHelper(config, kHRegSize, in0, in4, exp04);
3154 PnextHelper(config, kHRegSize, in1, in4, exp14);
3155 PnextHelper(config, kHRegSize, in2, in4, exp24);
3156 PnextHelper(config, kHRegSize, in3, in4, exp34);
3157 PnextHelper(config, kHRegSize, in4, in4, exp44);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003158}
3159
Jacob Bramleye8289202019-07-31 11:25:23 +01003160TEST_SVE(sve_pnext_s) {
Jacob Bramley0ce75842019-07-17 18:12:50 +01003161 // TODO: Once we have the infrastructure, provide more lanes than kPRegMinSize
3162 // (to check propagation if we have a large VL), but few enough to make the
3163 // test easy to read.
3164 // For now, we just use kPRegMinSize so that the test works anywhere.
3165 int in0[] = {0xe, 0xc, 0x8, 0x0};
3166 int in1[] = {0x0, 0x2, 0x0, 0x1};
3167 int in2[] = {0x0, 0x1, 0xf, 0x0};
3168 int in3[] = {0xf, 0x0, 0x0, 0x0};
3169
3170 // Pnext activates the next element that is true in pg, after the last-active
3171 // element in pn. If all pn elements are false (as in in0), it starts looking
3172 // at element 0.
3173 //
3174 // As for other SVE instructions, elements are only considered to be active if
3175 // the least-significant bit in each field is one. Other bits are ignored.
3176
3177 // There are no active lanes in in0, so the result is simply the first active
3178 // lane from pg.
3179 int exp00[] = {0, 0, 0, 0};
3180 int exp10[] = {0, 0, 0, 1};
3181 int exp20[] = {0, 0, 1, 0};
3182 int exp30[] = {1, 0, 0, 0};
3183
3184 // | The last active lane in in1 is here.
3185 // v
3186 int exp01[] = {0, 0, 0, 0};
3187 int exp11[] = {0, 0, 0, 0};
3188 int exp21[] = {0, 0, 1, 0};
3189 int exp31[] = {1, 0, 0, 0};
3190
3191 // | The last active lane in in2 is here.
3192 // v
3193 int exp02[] = {0, 0, 0, 0};
3194 int exp12[] = {0, 0, 0, 0};
3195 int exp22[] = {0, 0, 0, 0};
3196 int exp32[] = {1, 0, 0, 0};
3197
3198 // | The last active lane in in3 is here.
3199 // v
3200 int exp03[] = {0, 0, 0, 0};
3201 int exp13[] = {0, 0, 0, 0};
3202 int exp23[] = {0, 0, 0, 0};
3203 int exp33[] = {0, 0, 0, 0};
3204
Jacob Bramleye8289202019-07-31 11:25:23 +01003205 PnextHelper(config, kSRegSize, in0, in0, exp00);
3206 PnextHelper(config, kSRegSize, in1, in0, exp10);
3207 PnextHelper(config, kSRegSize, in2, in0, exp20);
3208 PnextHelper(config, kSRegSize, in3, in0, exp30);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003209
Jacob Bramleye8289202019-07-31 11:25:23 +01003210 PnextHelper(config, kSRegSize, in0, in1, exp01);
3211 PnextHelper(config, kSRegSize, in1, in1, exp11);
3212 PnextHelper(config, kSRegSize, in2, in1, exp21);
3213 PnextHelper(config, kSRegSize, in3, in1, exp31);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003214
Jacob Bramleye8289202019-07-31 11:25:23 +01003215 PnextHelper(config, kSRegSize, in0, in2, exp02);
3216 PnextHelper(config, kSRegSize, in1, in2, exp12);
3217 PnextHelper(config, kSRegSize, in2, in2, exp22);
3218 PnextHelper(config, kSRegSize, in3, in2, exp32);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003219
Jacob Bramleye8289202019-07-31 11:25:23 +01003220 PnextHelper(config, kSRegSize, in0, in3, exp03);
3221 PnextHelper(config, kSRegSize, in1, in3, exp13);
3222 PnextHelper(config, kSRegSize, in2, in3, exp23);
3223 PnextHelper(config, kSRegSize, in3, in3, exp33);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003224}
3225
Jacob Bramleye8289202019-07-31 11:25:23 +01003226TEST_SVE(sve_pnext_d) {
Jacob Bramley0ce75842019-07-17 18:12:50 +01003227 // TODO: Once we have the infrastructure, provide more lanes than kPRegMinSize
3228 // (to check propagation if we have a large VL), but few enough to make the
3229 // test easy to read.
3230 // For now, we just use kPRegMinSize so that the test works anywhere.
3231 int in0[] = {0xfe, 0xf0};
3232 int in1[] = {0x00, 0x55};
3233 int in2[] = {0x33, 0xff};
3234
3235 // Pnext activates the next element that is true in pg, after the last-active
3236 // element in pn. If all pn elements are false (as in in0), it starts looking
3237 // at element 0.
3238 //
3239 // As for other SVE instructions, elements are only considered to be active if
3240 // the least-significant bit in each field is one. Other bits are ignored.
3241
3242 // There are no active lanes in in0, so the result is simply the first active
3243 // lane from pg.
3244 int exp00[] = {0, 0};
3245 int exp10[] = {0, 1};
3246 int exp20[] = {0, 1};
3247
3248 // | The last active lane in in1 is here.
3249 // v
3250 int exp01[] = {0, 0};
3251 int exp11[] = {0, 0};
3252 int exp21[] = {1, 0};
3253
3254 // | The last active lane in in2 is here.
3255 // v
3256 int exp02[] = {0, 0};
3257 int exp12[] = {0, 0};
3258 int exp22[] = {0, 0};
3259
Jacob Bramleye8289202019-07-31 11:25:23 +01003260 PnextHelper(config, kDRegSize, in0, in0, exp00);
3261 PnextHelper(config, kDRegSize, in1, in0, exp10);
3262 PnextHelper(config, kDRegSize, in2, in0, exp20);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003263
Jacob Bramleye8289202019-07-31 11:25:23 +01003264 PnextHelper(config, kDRegSize, in0, in1, exp01);
3265 PnextHelper(config, kDRegSize, in1, in1, exp11);
3266 PnextHelper(config, kDRegSize, in2, in1, exp21);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003267
Jacob Bramleye8289202019-07-31 11:25:23 +01003268 PnextHelper(config, kDRegSize, in0, in2, exp02);
3269 PnextHelper(config, kDRegSize, in1, in2, exp12);
3270 PnextHelper(config, kDRegSize, in2, in2, exp22);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003271}
3272
Jacob Bramleye8289202019-07-31 11:25:23 +01003273TEST_SVE(sve_pnext_alias) {
3274 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003275 START();
3276
3277 // Check that the Simulator behaves correctly when all arguments are aliased.
3278 int in_b[] = {0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0};
3279 int in_h[] = {0, 0, 0, 0, 1, 1, 0, 0};
3280 int in_s[] = {0, 1, 1, 0};
3281 int in_d[] = {1, 1};
3282
3283 Initialise(&masm, p0.VnB(), in_b);
3284 Initialise(&masm, p1.VnH(), in_h);
3285 Initialise(&masm, p2.VnS(), in_s);
3286 Initialise(&masm, p3.VnD(), in_d);
3287
3288 // Initialise NZCV to an impossible value, to check that we actually write it.
3289 __ Mov(x10, NZCVFlag);
3290
3291 __ Msr(NZCV, x10);
Jacob Bramley7b5819c2020-06-17 17:29:16 +01003292 __ Pnext(p0.VnB(), p0, p0.VnB());
Jacob Bramley0ce75842019-07-17 18:12:50 +01003293 __ Mrs(x0, NZCV);
3294
3295 __ Msr(NZCV, x10);
Jacob Bramley7b5819c2020-06-17 17:29:16 +01003296 __ Pnext(p1.VnB(), p1, p1.VnB());
Jacob Bramley0ce75842019-07-17 18:12:50 +01003297 __ Mrs(x1, NZCV);
3298
3299 __ Msr(NZCV, x10);
Jacob Bramley7b5819c2020-06-17 17:29:16 +01003300 __ Pnext(p2.VnB(), p2, p2.VnB());
Jacob Bramley0ce75842019-07-17 18:12:50 +01003301 __ Mrs(x2, NZCV);
3302
3303 __ Msr(NZCV, x10);
Jacob Bramley7b5819c2020-06-17 17:29:16 +01003304 __ Pnext(p3.VnB(), p3, p3.VnB());
Jacob Bramley0ce75842019-07-17 18:12:50 +01003305 __ Mrs(x3, NZCV);
3306
3307 END();
3308
3309 if (CAN_RUN()) {
3310 RUN();
3311
3312 // Since pg.Is(pdn), there can be no active lanes in pg above the last
3313 // active lane in pdn, so the result should always be zero.
3314 ASSERT_EQUAL_SVE(0, p0.VnB());
3315 ASSERT_EQUAL_SVE(0, p1.VnH());
3316 ASSERT_EQUAL_SVE(0, p2.VnS());
3317 ASSERT_EQUAL_SVE(0, p3.VnD());
3318
3319 ASSERT_EQUAL_64(SVENoneFlag | SVENotLastFlag, x0);
3320 ASSERT_EQUAL_64(SVENoneFlag | SVENotLastFlag, x1);
3321 ASSERT_EQUAL_64(SVENoneFlag | SVENotLastFlag, x2);
3322 ASSERT_EQUAL_64(SVENoneFlag | SVENotLastFlag, x3);
3323 }
3324}
3325
Jacob Bramleye8289202019-07-31 11:25:23 +01003326static void PtrueHelper(Test* config,
3327 unsigned lane_size_in_bits,
Jacob Bramley0ce75842019-07-17 18:12:50 +01003328 FlagsUpdate s = LeaveFlags) {
Jacob Bramleye8289202019-07-31 11:25:23 +01003329 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003330 START();
3331
3332 PRegisterWithLaneSize p[kNumberOfPRegisters];
3333 for (unsigned i = 0; i < kNumberOfPRegisters; i++) {
3334 p[i] = PRegister(i).WithLaneSize(lane_size_in_bits);
3335 }
3336
3337 // Initialise NZCV to an impossible value, to check that we actually write it.
3338 StatusFlags nzcv_unmodified = NZCVFlag;
3339 __ Mov(x20, nzcv_unmodified);
3340
3341 // We don't have enough registers to conveniently test every pattern, so take
3342 // samples from each group.
3343 __ Msr(NZCV, x20);
3344 __ Ptrue(p[0], SVE_POW2, s);
3345 __ Mrs(x0, NZCV);
3346
3347 __ Msr(NZCV, x20);
3348 __ Ptrue(p[1], SVE_VL1, s);
3349 __ Mrs(x1, NZCV);
3350
3351 __ Msr(NZCV, x20);
3352 __ Ptrue(p[2], SVE_VL2, s);
3353 __ Mrs(x2, NZCV);
3354
3355 __ Msr(NZCV, x20);
3356 __ Ptrue(p[3], SVE_VL5, s);
3357 __ Mrs(x3, NZCV);
3358
3359 __ Msr(NZCV, x20);
3360 __ Ptrue(p[4], SVE_VL6, s);
3361 __ Mrs(x4, NZCV);
3362
3363 __ Msr(NZCV, x20);
3364 __ Ptrue(p[5], SVE_VL8, s);
3365 __ Mrs(x5, NZCV);
3366
3367 __ Msr(NZCV, x20);
3368 __ Ptrue(p[6], SVE_VL16, s);
3369 __ Mrs(x6, NZCV);
3370
3371 __ Msr(NZCV, x20);
3372 __ Ptrue(p[7], SVE_VL64, s);
3373 __ Mrs(x7, NZCV);
3374
3375 __ Msr(NZCV, x20);
3376 __ Ptrue(p[8], SVE_VL256, s);
3377 __ Mrs(x8, NZCV);
3378
3379 {
3380 // We have to use the Assembler to encode values not defined by
3381 // SVEPredicateConstraint, so call `ptrue` or `ptrues` directly.
3382 typedef void (
3383 MacroAssembler::*AssemblePtrueFn)(const PRegisterWithLaneSize& pd,
3384 int pattern);
Martyn Capewellae0af052022-01-20 17:56:28 +00003385 AssemblePtrueFn assemble = &MacroAssembler::ptrue;
3386 if (s == SetFlags) {
3387 assemble = &MacroAssembler::ptrues;
3388 }
Jacob Bramley0ce75842019-07-17 18:12:50 +01003389
3390 ExactAssemblyScope guard(&masm, 12 * kInstructionSize);
3391 __ msr(NZCV, x20);
3392 (masm.*assemble)(p[9], 0xe);
3393 __ mrs(x9, NZCV);
3394
3395 __ msr(NZCV, x20);
3396 (masm.*assemble)(p[10], 0x16);
3397 __ mrs(x10, NZCV);
3398
3399 __ msr(NZCV, x20);
3400 (masm.*assemble)(p[11], 0x1a);
3401 __ mrs(x11, NZCV);
3402
3403 __ msr(NZCV, x20);
3404 (masm.*assemble)(p[12], 0x1c);
3405 __ mrs(x12, NZCV);
3406 }
3407
3408 __ Msr(NZCV, x20);
3409 __ Ptrue(p[13], SVE_MUL4, s);
3410 __ Mrs(x13, NZCV);
3411
3412 __ Msr(NZCV, x20);
3413 __ Ptrue(p[14], SVE_MUL3, s);
3414 __ Mrs(x14, NZCV);
3415
3416 __ Msr(NZCV, x20);
3417 __ Ptrue(p[15], SVE_ALL, s);
3418 __ Mrs(x15, NZCV);
3419
3420 END();
3421
3422 if (CAN_RUN()) {
3423 RUN();
3424
3425 int all = core.GetSVELaneCount(lane_size_in_bits);
3426 int pow2 = 1 << HighestSetBitPosition(all);
3427 int mul4 = all - (all % 4);
3428 int mul3 = all - (all % 3);
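    // For example (illustrative): with a 384-bit VL there are 48 B lanes, so
    // pow2 = 32, mul4 = 48 and mul3 = 48, while the fixed VL16 and VL64
    // patterns select 16 and 0 lanes respectively.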
3429
3430 // Check P register results.
3431 for (int i = 0; i < all; i++) {
3432 ASSERT_EQUAL_SVE_LANE(i < pow2, p[0], i);
3433 ASSERT_EQUAL_SVE_LANE((all >= 1) && (i < 1), p[1], i);
3434 ASSERT_EQUAL_SVE_LANE((all >= 2) && (i < 2), p[2], i);
3435 ASSERT_EQUAL_SVE_LANE((all >= 5) && (i < 5), p[3], i);
3436 ASSERT_EQUAL_SVE_LANE((all >= 6) && (i < 6), p[4], i);
3437 ASSERT_EQUAL_SVE_LANE((all >= 8) && (i < 8), p[5], i);
3438 ASSERT_EQUAL_SVE_LANE((all >= 16) && (i < 16), p[6], i);
3439 ASSERT_EQUAL_SVE_LANE((all >= 64) && (i < 64), p[7], i);
3440 ASSERT_EQUAL_SVE_LANE((all >= 256) && (i < 256), p[8], i);
3441 ASSERT_EQUAL_SVE_LANE(false, p[9], i);
3442 ASSERT_EQUAL_SVE_LANE(false, p[10], i);
3443 ASSERT_EQUAL_SVE_LANE(false, p[11], i);
3444 ASSERT_EQUAL_SVE_LANE(false, p[12], i);
3445 ASSERT_EQUAL_SVE_LANE(i < mul4, p[13], i);
3446 ASSERT_EQUAL_SVE_LANE(i < mul3, p[14], i);
3447 ASSERT_EQUAL_SVE_LANE(true, p[15], i);
3448 }
3449
3450 // Check NZCV results.
3451 if (s == LeaveFlags) {
3452 // No flags should have been updated.
3453 for (int i = 0; i <= 15; i++) {
3454 ASSERT_EQUAL_64(nzcv_unmodified, XRegister(i));
3455 }
3456 } else {
3457 StatusFlags zero = static_cast<StatusFlags>(SVENoneFlag | SVENotLastFlag);
3458 StatusFlags nonzero = SVEFirstFlag;
3459
3460 // POW2
3461 ASSERT_EQUAL_64(nonzero, x0);
3462 // VL*
3463 ASSERT_EQUAL_64((all >= 1) ? nonzero : zero, x1);
3464 ASSERT_EQUAL_64((all >= 2) ? nonzero : zero, x2);
3465 ASSERT_EQUAL_64((all >= 5) ? nonzero : zero, x3);
3466 ASSERT_EQUAL_64((all >= 6) ? nonzero : zero, x4);
3467 ASSERT_EQUAL_64((all >= 8) ? nonzero : zero, x5);
3468 ASSERT_EQUAL_64((all >= 16) ? nonzero : zero, x6);
3469 ASSERT_EQUAL_64((all >= 64) ? nonzero : zero, x7);
3470 ASSERT_EQUAL_64((all >= 256) ? nonzero : zero, x8);
3471 // #uimm5
3472 ASSERT_EQUAL_64(zero, x9);
3473 ASSERT_EQUAL_64(zero, x10);
3474 ASSERT_EQUAL_64(zero, x11);
3475 ASSERT_EQUAL_64(zero, x12);
3476 // MUL*
3477 ASSERT_EQUAL_64((all >= 4) ? nonzero : zero, x13);
3478 ASSERT_EQUAL_64((all >= 3) ? nonzero : zero, x14);
3479 // ALL
3480 ASSERT_EQUAL_64(nonzero, x15);
3481 }
3482 }
3483}
3484
Jacob Bramleye8289202019-07-31 11:25:23 +01003485TEST_SVE(sve_ptrue_b) { PtrueHelper(config, kBRegSize, LeaveFlags); }
3486TEST_SVE(sve_ptrue_h) { PtrueHelper(config, kHRegSize, LeaveFlags); }
3487TEST_SVE(sve_ptrue_s) { PtrueHelper(config, kSRegSize, LeaveFlags); }
3488TEST_SVE(sve_ptrue_d) { PtrueHelper(config, kDRegSize, LeaveFlags); }
Jacob Bramley0ce75842019-07-17 18:12:50 +01003489
Jacob Bramleye8289202019-07-31 11:25:23 +01003490TEST_SVE(sve_ptrues_b) { PtrueHelper(config, kBRegSize, SetFlags); }
3491TEST_SVE(sve_ptrues_h) { PtrueHelper(config, kHRegSize, SetFlags); }
3492TEST_SVE(sve_ptrues_s) { PtrueHelper(config, kSRegSize, SetFlags); }
3493TEST_SVE(sve_ptrues_d) { PtrueHelper(config, kDRegSize, SetFlags); }
Jacob Bramley0ce75842019-07-17 18:12:50 +01003494
Jacob Bramleye8289202019-07-31 11:25:23 +01003495TEST_SVE(sve_pfalse) {
3496 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003497 START();
3498
3499 // Initialise non-zero inputs.
3500 __ Ptrue(p0.VnB());
3501 __ Ptrue(p1.VnH());
3502 __ Ptrue(p2.VnS());
3503 __ Ptrue(p3.VnD());
3504
3505 // The instruction only supports B-sized lanes, but the lane size has no
3506 // logical effect, so the MacroAssembler accepts anything.
3507 __ Pfalse(p0.VnB());
3508 __ Pfalse(p1.VnH());
3509 __ Pfalse(p2.VnS());
3510 __ Pfalse(p3.VnD());
3511
3512 END();
3513
3514 if (CAN_RUN()) {
3515 RUN();
3516
3517 ASSERT_EQUAL_SVE(0, p0.VnB());
3518 ASSERT_EQUAL_SVE(0, p1.VnB());
3519 ASSERT_EQUAL_SVE(0, p2.VnB());
3520 ASSERT_EQUAL_SVE(0, p3.VnB());
3521 }
3522}
3523
Jacob Bramleye8289202019-07-31 11:25:23 +01003524TEST_SVE(sve_ptest) {
3525 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramley0ce75842019-07-17 18:12:50 +01003526 START();
3527
3528 // Initialise NZCV to a known (impossible) value.
3529 StatusFlags nzcv_unmodified = NZCVFlag;
3530 __ Mov(x0, nzcv_unmodified);
3531 __ Msr(NZCV, x0);
3532
3533 // Construct some test inputs.
3534 int in2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0};
3535 int in3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0};
3536 int in4[] = {0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0};
3537 __ Pfalse(p0.VnB());
3538 __ Ptrue(p1.VnB());
3539 Initialise(&masm, p2.VnB(), in2);
3540 Initialise(&masm, p3.VnB(), in3);
3541 Initialise(&masm, p4.VnB(), in4);
3542
3543 // All-inactive pg.
3544 __ Ptest(p0, p0.VnB());
3545 __ Mrs(x0, NZCV);
3546 __ Ptest(p0, p1.VnB());
3547 __ Mrs(x1, NZCV);
3548 __ Ptest(p0, p2.VnB());
3549 __ Mrs(x2, NZCV);
3550 __ Ptest(p0, p3.VnB());
3551 __ Mrs(x3, NZCV);
3552 __ Ptest(p0, p4.VnB());
3553 __ Mrs(x4, NZCV);
3554
3555 // All-active pg.
3556 __ Ptest(p1, p0.VnB());
3557 __ Mrs(x5, NZCV);
3558 __ Ptest(p1, p1.VnB());
3559 __ Mrs(x6, NZCV);
3560 __ Ptest(p1, p2.VnB());
3561 __ Mrs(x7, NZCV);
3562 __ Ptest(p1, p3.VnB());
3563 __ Mrs(x8, NZCV);
3564 __ Ptest(p1, p4.VnB());
3565 __ Mrs(x9, NZCV);
3566
3567 // Combinations of other inputs.
3568 __ Ptest(p2, p2.VnB());
3569 __ Mrs(x20, NZCV);
3570 __ Ptest(p2, p3.VnB());
3571 __ Mrs(x21, NZCV);
3572 __ Ptest(p2, p4.VnB());
3573 __ Mrs(x22, NZCV);
3574 __ Ptest(p3, p2.VnB());
3575 __ Mrs(x23, NZCV);
3576 __ Ptest(p3, p3.VnB());
3577 __ Mrs(x24, NZCV);
3578 __ Ptest(p3, p4.VnB());
3579 __ Mrs(x25, NZCV);
3580 __ Ptest(p4, p2.VnB());
3581 __ Mrs(x26, NZCV);
3582 __ Ptest(p4, p3.VnB());
3583 __ Mrs(x27, NZCV);
3584 __ Ptest(p4, p4.VnB());
3585 __ Mrs(x28, NZCV);
3586
3587 END();
3588
3589 if (CAN_RUN()) {
3590 RUN();
3591
3592 StatusFlags zero = static_cast<StatusFlags>(SVENoneFlag | SVENotLastFlag);
3593
3594 // If pg is all inactive, the value of pn is irrelevant.
3595 ASSERT_EQUAL_64(zero, x0);
3596 ASSERT_EQUAL_64(zero, x1);
3597 ASSERT_EQUAL_64(zero, x2);
3598 ASSERT_EQUAL_64(zero, x3);
3599 ASSERT_EQUAL_64(zero, x4);
3600
3601 // All-active pg.
3602 ASSERT_EQUAL_64(zero, x5); // All-inactive pn.
3603 ASSERT_EQUAL_64(SVEFirstFlag, x6); // All-active pn.
3604 // Other pn inputs are non-zero, but the first and last lanes are inactive.
3605 ASSERT_EQUAL_64(SVENotLastFlag, x7);
3606 ASSERT_EQUAL_64(SVENotLastFlag, x8);
3607 ASSERT_EQUAL_64(SVENotLastFlag, x9);
3608
3609 // Other inputs.
3610 ASSERT_EQUAL_64(SVEFirstFlag, x20); // pg: in2, pn: in2
3611 ASSERT_EQUAL_64(NoFlag, x21); // pg: in2, pn: in3
3612 ASSERT_EQUAL_64(zero, x22); // pg: in2, pn: in4
3613 ASSERT_EQUAL_64(static_cast<StatusFlags>(SVEFirstFlag | SVENotLastFlag),
3614 x23); // pg: in3, pn: in2
3615 ASSERT_EQUAL_64(SVEFirstFlag, x24); // pg: in3, pn: in3
3616 ASSERT_EQUAL_64(zero, x25); // pg: in3, pn: in4
3617 ASSERT_EQUAL_64(zero, x26); // pg: in4, pn: in2
3618 ASSERT_EQUAL_64(zero, x27); // pg: in4, pn: in3
3619 ASSERT_EQUAL_64(SVEFirstFlag, x28); // pg: in4, pn: in4
3620 }
3621}
3622
Jacob Bramleye8289202019-07-31 11:25:23 +01003623TEST_SVE(sve_cntp) {
3624 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramleyd961a0c2019-07-17 10:53:45 +01003625 START();
3626
3627 // There are {7, 5, 2, 1} active {B, H, S, D} lanes.
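  // (When counting at H, S or D granularity, only the lowest bit of each
  // field is inspected, so just every second, fourth or eighth value above
  // contributes to the count.)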
3628 int p0_inputs[] = {0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0};
3629 Initialise(&masm, p0.VnB(), p0_inputs);
3630
3631 // With an all-true predicate, these instructions measure the vector length.
3632 __ Ptrue(p10.VnB());
3633 __ Ptrue(p11.VnH());
3634 __ Ptrue(p12.VnS());
3635 __ Ptrue(p13.VnD());
3636
3637 // `ptrue p10.b` provides an all-active pg.
3638 __ Cntp(x10, p10, p10.VnB());
3639 __ Cntp(x11, p10, p11.VnH());
3640 __ Cntp(x12, p10, p12.VnS());
3641 __ Cntp(x13, p10, p13.VnD());
3642
3643 // Check that the predicate mask is applied properly.
3644 __ Cntp(x14, p10, p10.VnB());
3645 __ Cntp(x15, p11, p10.VnB());
3646 __ Cntp(x16, p12, p10.VnB());
3647 __ Cntp(x17, p13, p10.VnB());
3648
3649 // Check other patterns (including some ignored bits).
3650 __ Cntp(x0, p10, p0.VnB());
3651 __ Cntp(x1, p10, p0.VnH());
3652 __ Cntp(x2, p10, p0.VnS());
3653 __ Cntp(x3, p10, p0.VnD());
3654 __ Cntp(x4, p0, p10.VnB());
3655 __ Cntp(x5, p0, p10.VnH());
3656 __ Cntp(x6, p0, p10.VnS());
3657 __ Cntp(x7, p0, p10.VnD());
3658
3659 END();
3660
3661 if (CAN_RUN()) {
3662 RUN();
3663
3664 int vl_b = core.GetSVELaneCount(kBRegSize);
3665 int vl_h = core.GetSVELaneCount(kHRegSize);
3666 int vl_s = core.GetSVELaneCount(kSRegSize);
3667 int vl_d = core.GetSVELaneCount(kDRegSize);
3668
3669 // Check all-active predicates in various combinations.
3670 ASSERT_EQUAL_64(vl_b, x10);
3671 ASSERT_EQUAL_64(vl_h, x11);
3672 ASSERT_EQUAL_64(vl_s, x12);
3673 ASSERT_EQUAL_64(vl_d, x13);
3674
3675 ASSERT_EQUAL_64(vl_b, x14);
3676 ASSERT_EQUAL_64(vl_h, x15);
3677 ASSERT_EQUAL_64(vl_s, x16);
3678 ASSERT_EQUAL_64(vl_d, x17);
3679
3680 // Check that irrelevant bits are properly ignored.
3681 ASSERT_EQUAL_64(7, x0);
3682 ASSERT_EQUAL_64(5, x1);
3683 ASSERT_EQUAL_64(2, x2);
3684 ASSERT_EQUAL_64(1, x3);
3685
3686 ASSERT_EQUAL_64(7, x4);
3687 ASSERT_EQUAL_64(5, x5);
3688 ASSERT_EQUAL_64(2, x6);
3689 ASSERT_EQUAL_64(1, x7);
3690 }
3691}
3692
Martyn Capewell74f84f62019-10-30 15:30:44 +00003693typedef void (MacroAssembler::*CntFn)(const Register& dst,
3694 int pattern,
3695 int multiplier);
3696
Martyn Capewell91d5ba32019-11-01 18:11:23 +00003697template <typename T>
3698void GenerateCntSequence(MacroAssembler* masm,
3699 CntFn cnt,
3700 T acc_value,
3701 int multiplier) {
3702 // Initialise accumulators.
3703 masm->Mov(x0, acc_value);
3704 masm->Mov(x1, acc_value);
3705 masm->Mov(x2, acc_value);
3706 masm->Mov(x3, acc_value);
3707 masm->Mov(x4, acc_value);
3708 masm->Mov(x5, acc_value);
3709 masm->Mov(x6, acc_value);
3710 masm->Mov(x7, acc_value);
3711 masm->Mov(x8, acc_value);
3712 masm->Mov(x9, acc_value);
3713 masm->Mov(x10, acc_value);
3714 masm->Mov(x11, acc_value);
3715 masm->Mov(x12, acc_value);
3716 masm->Mov(x13, acc_value);
3717 masm->Mov(x14, acc_value);
3718 masm->Mov(x15, acc_value);
3719 masm->Mov(x18, acc_value);
3720 masm->Mov(x19, acc_value);
3721 masm->Mov(x20, acc_value);
3722 masm->Mov(x21, acc_value);
3723
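  // The destination register width (W or X) follows the accumulator type T,
  // so 32-bit accumulators exercise the W-register forms of each instruction.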
3724 (masm->*cnt)(Register(0, sizeof(T) * kBitsPerByte), SVE_POW2, multiplier);
3725 (masm->*cnt)(Register(1, sizeof(T) * kBitsPerByte), SVE_VL1, multiplier);
3726 (masm->*cnt)(Register(2, sizeof(T) * kBitsPerByte), SVE_VL2, multiplier);
3727 (masm->*cnt)(Register(3, sizeof(T) * kBitsPerByte), SVE_VL3, multiplier);
3728 (masm->*cnt)(Register(4, sizeof(T) * kBitsPerByte), SVE_VL4, multiplier);
3729 (masm->*cnt)(Register(5, sizeof(T) * kBitsPerByte), SVE_VL5, multiplier);
3730 (masm->*cnt)(Register(6, sizeof(T) * kBitsPerByte), SVE_VL6, multiplier);
3731 (masm->*cnt)(Register(7, sizeof(T) * kBitsPerByte), SVE_VL7, multiplier);
3732 (masm->*cnt)(Register(8, sizeof(T) * kBitsPerByte), SVE_VL8, multiplier);
3733 (masm->*cnt)(Register(9, sizeof(T) * kBitsPerByte), SVE_VL16, multiplier);
3734 (masm->*cnt)(Register(10, sizeof(T) * kBitsPerByte), SVE_VL32, multiplier);
3735 (masm->*cnt)(Register(11, sizeof(T) * kBitsPerByte), SVE_VL64, multiplier);
3736 (masm->*cnt)(Register(12, sizeof(T) * kBitsPerByte), SVE_VL128, multiplier);
3737 (masm->*cnt)(Register(13, sizeof(T) * kBitsPerByte), SVE_VL256, multiplier);
3738 (masm->*cnt)(Register(14, sizeof(T) * kBitsPerByte), 16, multiplier);
3739 (masm->*cnt)(Register(15, sizeof(T) * kBitsPerByte), 23, multiplier);
3740 (masm->*cnt)(Register(18, sizeof(T) * kBitsPerByte), 28, multiplier);
3741 (masm->*cnt)(Register(19, sizeof(T) * kBitsPerByte), SVE_MUL4, multiplier);
3742 (masm->*cnt)(Register(20, sizeof(T) * kBitsPerByte), SVE_MUL3, multiplier);
3743 (masm->*cnt)(Register(21, sizeof(T) * kBitsPerByte), SVE_ALL, multiplier);
3744}
3745
3746int FixedVL(int fixed, int length) {
3747 VIXL_ASSERT(((fixed >= 1) && (fixed <= 8)) || (fixed == 16) ||
3748 (fixed == 32) || (fixed == 64) || (fixed == 128) ||
3749 (fixed == 256));
3750 return (length >= fixed) ? fixed : 0;
3751}
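// For example (illustrative): with 48 lanes available, FixedVL(16, 48) == 16,
// but FixedVL(64, 48) == 0 because the fixed-length pattern cannot be
// satisfied.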
3752
Martyn Capewell74f84f62019-10-30 15:30:44 +00003753static void CntHelper(Test* config,
3754 CntFn cnt,
3755 int multiplier,
Martyn Capewell579c92d2019-10-30 17:48:52 +00003756 int lane_size_in_bits,
3757 int64_t acc_value = 0,
3758 bool is_increment = true) {
Martyn Capewell74f84f62019-10-30 15:30:44 +00003759 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
3760 START();
Martyn Capewell91d5ba32019-11-01 18:11:23 +00003761 GenerateCntSequence(&masm, cnt, acc_value, multiplier);
Martyn Capewell74f84f62019-10-30 15:30:44 +00003762 END();
3763
3764 if (CAN_RUN()) {
3765 RUN();
3766
3767 int all = core.GetSVELaneCount(lane_size_in_bits);
3768 int pow2 = 1 << HighestSetBitPosition(all);
3769 int mul4 = all - (all % 4);
3770 int mul3 = all - (all % 3);
3771
Martyn Capewell579c92d2019-10-30 17:48:52 +00003772 multiplier = is_increment ? multiplier : -multiplier;
3773
3774 ASSERT_EQUAL_64(acc_value + (multiplier * pow2), x0);
Martyn Capewell91d5ba32019-11-01 18:11:23 +00003775 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(1, all)), x1);
3776 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(2, all)), x2);
3777 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(3, all)), x3);
3778 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(4, all)), x4);
3779 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(5, all)), x5);
3780 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(6, all)), x6);
3781 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(7, all)), x7);
3782 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(8, all)), x8);
3783 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(16, all)), x9);
3784 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(32, all)), x10);
3785 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(64, all)), x11);
3786 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(128, all)), x12);
3787 ASSERT_EQUAL_64(acc_value + (multiplier * FixedVL(256, all)), x13);
Martyn Capewell579c92d2019-10-30 17:48:52 +00003788 ASSERT_EQUAL_64(acc_value, x14);
3789 ASSERT_EQUAL_64(acc_value, x15);
3790 ASSERT_EQUAL_64(acc_value, x18);
3791 ASSERT_EQUAL_64(acc_value + (multiplier * mul4), x19);
3792 ASSERT_EQUAL_64(acc_value + (multiplier * mul3), x20);
3793 ASSERT_EQUAL_64(acc_value + (multiplier * all), x21);
Martyn Capewell74f84f62019-10-30 15:30:44 +00003794 }
3795}
3796
Martyn Capewell579c92d2019-10-30 17:48:52 +00003797static void IncHelper(Test* config,
3798 CntFn cnt,
3799 int multiplier,
3800 int lane_size_in_bits,
3801 int64_t acc_value) {
3802 CntHelper(config, cnt, multiplier, lane_size_in_bits, acc_value, true);
3803}
3804
3805static void DecHelper(Test* config,
3806 CntFn cnt,
3807 int multiplier,
3808 int lane_size_in_bits,
3809 int64_t acc_value) {
3810 CntHelper(config, cnt, multiplier, lane_size_in_bits, acc_value, false);
3811}
3812
Martyn Capewell74f84f62019-10-30 15:30:44 +00003813TEST_SVE(sve_cntb) {
3814 CntHelper(config, &MacroAssembler::Cntb, 1, kBRegSize);
3815 CntHelper(config, &MacroAssembler::Cntb, 2, kBRegSize);
3816 CntHelper(config, &MacroAssembler::Cntb, 15, kBRegSize);
3817 CntHelper(config, &MacroAssembler::Cntb, 16, kBRegSize);
3818}
3819
3820TEST_SVE(sve_cnth) {
3821 CntHelper(config, &MacroAssembler::Cnth, 1, kHRegSize);
3822 CntHelper(config, &MacroAssembler::Cnth, 2, kHRegSize);
3823 CntHelper(config, &MacroAssembler::Cnth, 15, kHRegSize);
3824 CntHelper(config, &MacroAssembler::Cnth, 16, kHRegSize);
3825}
3826
3827TEST_SVE(sve_cntw) {
3828 CntHelper(config, &MacroAssembler::Cntw, 1, kWRegSize);
3829 CntHelper(config, &MacroAssembler::Cntw, 2, kWRegSize);
3830 CntHelper(config, &MacroAssembler::Cntw, 15, kWRegSize);
3831 CntHelper(config, &MacroAssembler::Cntw, 16, kWRegSize);
3832}
3833
3834TEST_SVE(sve_cntd) {
3835 CntHelper(config, &MacroAssembler::Cntd, 1, kDRegSize);
3836 CntHelper(config, &MacroAssembler::Cntd, 2, kDRegSize);
3837 CntHelper(config, &MacroAssembler::Cntd, 15, kDRegSize);
3838 CntHelper(config, &MacroAssembler::Cntd, 16, kDRegSize);
3839}
3840
Martyn Capewell579c92d2019-10-30 17:48:52 +00003841TEST_SVE(sve_decb) {
3842 DecHelper(config, &MacroAssembler::Decb, 1, kBRegSize, 42);
3843 DecHelper(config, &MacroAssembler::Decb, 2, kBRegSize, -1);
3844 DecHelper(config, &MacroAssembler::Decb, 15, kBRegSize, INT64_MIN);
3845 DecHelper(config, &MacroAssembler::Decb, 16, kBRegSize, -42);
3846}
3847
3848TEST_SVE(sve_dech) {
3849 DecHelper(config, &MacroAssembler::Dech, 1, kHRegSize, 42);
3850 DecHelper(config, &MacroAssembler::Dech, 2, kHRegSize, -1);
3851 DecHelper(config, &MacroAssembler::Dech, 15, kHRegSize, INT64_MIN);
3852 DecHelper(config, &MacroAssembler::Dech, 16, kHRegSize, -42);
3853}
3854
3855TEST_SVE(sve_decw) {
3856 DecHelper(config, &MacroAssembler::Decw, 1, kWRegSize, 42);
3857 DecHelper(config, &MacroAssembler::Decw, 2, kWRegSize, -1);
3858 DecHelper(config, &MacroAssembler::Decw, 15, kWRegSize, INT64_MIN);
3859 DecHelper(config, &MacroAssembler::Decw, 16, kWRegSize, -42);
3860}
3861
3862TEST_SVE(sve_decd) {
3863 DecHelper(config, &MacroAssembler::Decd, 1, kDRegSize, 42);
3864 DecHelper(config, &MacroAssembler::Decd, 2, kDRegSize, -1);
3865 DecHelper(config, &MacroAssembler::Decd, 15, kDRegSize, INT64_MIN);
3866 DecHelper(config, &MacroAssembler::Decd, 16, kDRegSize, -42);
3867}
3868
3869TEST_SVE(sve_incb) {
3870 IncHelper(config, &MacroAssembler::Incb, 1, kBRegSize, 42);
3871 IncHelper(config, &MacroAssembler::Incb, 2, kBRegSize, -1);
3872 IncHelper(config, &MacroAssembler::Incb, 15, kBRegSize, INT64_MAX);
3873 IncHelper(config, &MacroAssembler::Incb, 16, kBRegSize, -42);
3874}
3875
3876TEST_SVE(sve_inch) {
3877 IncHelper(config, &MacroAssembler::Inch, 1, kHRegSize, 42);
3878 IncHelper(config, &MacroAssembler::Inch, 2, kHRegSize, -1);
3879 IncHelper(config, &MacroAssembler::Inch, 15, kHRegSize, INT64_MAX);
3880 IncHelper(config, &MacroAssembler::Inch, 16, kHRegSize, -42);
3881}
3882
3883TEST_SVE(sve_incw) {
3884 IncHelper(config, &MacroAssembler::Incw, 1, kWRegSize, 42);
3885 IncHelper(config, &MacroAssembler::Incw, 2, kWRegSize, -1);
3886 IncHelper(config, &MacroAssembler::Incw, 15, kWRegSize, INT64_MAX);
3887 IncHelper(config, &MacroAssembler::Incw, 16, kWRegSize, -42);
3888}
3889
3890TEST_SVE(sve_incd) {
3891 IncHelper(config, &MacroAssembler::Incd, 1, kDRegSize, 42);
3892 IncHelper(config, &MacroAssembler::Incd, 2, kDRegSize, -1);
3893 IncHelper(config, &MacroAssembler::Incd, 15, kDRegSize, INT64_MAX);
3894 IncHelper(config, &MacroAssembler::Incd, 16, kDRegSize, -42);
3895}
3896
Martyn Capewell91d5ba32019-11-01 18:11:23 +00003897template <typename T>
3898static T QAdd(T x, int y) {
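  // -y is formed below, and INT_MIN has no positive counterpart, so it is
  // excluded.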
3899 VIXL_ASSERT(y > INT_MIN);
3900 T result;
3901 T min = std::numeric_limits<T>::min();
3902 T max = std::numeric_limits<T>::max();
3903 if ((x >= 0) && (y >= 0)) {
3904 // For positive a and b, saturate at max.
3905 result = (max - x) < static_cast<T>(y) ? max : x + y;
3906 } else if ((y < 0) && ((x < 0) || (min == 0))) {
3907 // For negative b, where either a negative or T unsigned.
3908 result = (x - min) < static_cast<T>(-y) ? min : x + y;
3909 } else {
3910 result = x + y;
3911 }
3912 return result;
3913}
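// Worked examples (illustrative): QAdd<uint8_t>(250, 10) == 255 (saturates at
// the type's maximum), QAdd<int8_t>(-120, -20) == -128 (saturates at the
// minimum), and QAdd<uint8_t>(5, -10) == 0 (unsigned values saturate at zero).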
3914
3915template <typename T>
3916static void QIncDecHelper(Test* config,
3917 CntFn cnt,
3918 int multiplier,
3919 int lane_size_in_bits,
3920 T acc_value,
3921 bool is_increment) {
3922 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
3923 START();
3924 GenerateCntSequence(&masm, cnt, acc_value, multiplier);
3925 END();
3926
3927 if (CAN_RUN()) {
3928 RUN();
3929
3930 int all = core.GetSVELaneCount(lane_size_in_bits);
3931 int pow2 = 1 << HighestSetBitPosition(all);
3932 int mul4 = all - (all % 4);
3933 int mul3 = all - (all % 3);
3934
3935 multiplier = is_increment ? multiplier : -multiplier;
3936
3937 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * pow2), x0);
3938 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(1, all)), x1);
3939 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(2, all)), x2);
3940 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(3, all)), x3);
3941 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(4, all)), x4);
3942 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(5, all)), x5);
3943 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(6, all)), x6);
3944 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(7, all)), x7);
3945 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(8, all)), x8);
3946 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(16, all)), x9);
3947 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(32, all)), x10);
3948 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(64, all)), x11);
3949 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(128, all)), x12);
3950 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(256, all)), x13);
3951 ASSERT_EQUAL_64(acc_value, x14);
3952 ASSERT_EQUAL_64(acc_value, x15);
3953 ASSERT_EQUAL_64(acc_value, x18);
3954 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * mul4), x19);
3955 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * mul3), x20);
3956 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * all), x21);
3957 }
3958}
3959
3960template <typename T>
3961static void QIncHelper(Test* config,
3962 CntFn cnt,
3963 int multiplier,
3964 int lane_size_in_bits,
3965 T acc_value) {
3966 QIncDecHelper<T>(config, cnt, multiplier, lane_size_in_bits, acc_value, true);
3967}
3968
3969template <typename T>
3970static void QDecHelper(Test* config,
3971 CntFn cnt,
3972 int multiplier,
3973 int lane_size_in_bits,
3974 T acc_value) {
3975 QIncDecHelper<T>(config,
3976 cnt,
3977 multiplier,
3978 lane_size_in_bits,
3979 acc_value,
3980 false);
3981}
3982
3983TEST_SVE(sve_sqdecb) {
3984 int64_t bigneg = INT64_MIN + 42;
3985 int64_t bigpos = INT64_MAX - 42;
3986 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecb, 1, kBRegSize, 1);
3987 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecb, 2, kBRegSize, bigneg);
3988 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecb, 15, kBRegSize, 999);
3989 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecb, 16, kBRegSize, bigpos);
3990}
3991
3992TEST_SVE(sve_sqdech) {
3993 int64_t bigneg = INT64_MIN + 42;
3994 int64_t bigpos = INT64_MAX - 42;
3995 QDecHelper<int64_t>(config, &MacroAssembler::Sqdech, 1, kHRegSize, 1);
3996 QDecHelper<int64_t>(config, &MacroAssembler::Sqdech, 2, kHRegSize, bigneg);
3997 QDecHelper<int64_t>(config, &MacroAssembler::Sqdech, 15, kHRegSize, 999);
3998 QDecHelper<int64_t>(config, &MacroAssembler::Sqdech, 16, kHRegSize, bigpos);
3999}
4000
4001TEST_SVE(sve_sqdecw) {
4002 int64_t bigneg = INT64_MIN + 42;
4003 int64_t bigpos = INT64_MAX - 42;
4004 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecw, 1, kWRegSize, 1);
4005 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecw, 2, kWRegSize, bigneg);
4006 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecw, 15, kWRegSize, 999);
4007 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecw, 16, kWRegSize, bigpos);
4008}
4009
4010TEST_SVE(sve_sqdecd) {
4011 int64_t bigneg = INT64_MIN + 42;
4012 int64_t bigpos = INT64_MAX - 42;
4013 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecd, 1, kDRegSize, 1);
4014 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecd, 2, kDRegSize, bigneg);
4015 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecd, 15, kDRegSize, 999);
4016 QDecHelper<int64_t>(config, &MacroAssembler::Sqdecd, 16, kDRegSize, bigpos);
4017}
4018
4019TEST_SVE(sve_sqincb) {
4020 int64_t bigneg = INT64_MIN + 42;
4021 int64_t bigpos = INT64_MAX - 42;
4022 QIncHelper<int64_t>(config, &MacroAssembler::Sqincb, 1, kBRegSize, 1);
4023 QIncHelper<int64_t>(config, &MacroAssembler::Sqincb, 2, kBRegSize, bigneg);
4024 QIncHelper<int64_t>(config, &MacroAssembler::Sqincb, 15, kBRegSize, 999);
4025 QIncHelper<int64_t>(config, &MacroAssembler::Sqincb, 16, kBRegSize, bigpos);
4026}
4027
4028TEST_SVE(sve_sqinch) {
4029 int64_t bigneg = INT64_MIN + 42;
4030 int64_t bigpos = INT64_MAX - 42;
4031 QIncHelper<int64_t>(config, &MacroAssembler::Sqinch, 1, kHRegSize, 1);
4032 QIncHelper<int64_t>(config, &MacroAssembler::Sqinch, 2, kHRegSize, bigneg);
4033 QIncHelper<int64_t>(config, &MacroAssembler::Sqinch, 15, kHRegSize, 999);
4034 QIncHelper<int64_t>(config, &MacroAssembler::Sqinch, 16, kHRegSize, bigpos);
4035}
4036
4037TEST_SVE(sve_sqincw) {
4038 int64_t bigneg = INT64_MIN + 42;
4039 int64_t bigpos = INT64_MAX - 42;
4040 QIncHelper<int64_t>(config, &MacroAssembler::Sqincw, 1, kWRegSize, 1);
4041 QIncHelper<int64_t>(config, &MacroAssembler::Sqincw, 2, kWRegSize, bigneg);
4042 QIncHelper<int64_t>(config, &MacroAssembler::Sqincw, 15, kWRegSize, 999);
4043 QIncHelper<int64_t>(config, &MacroAssembler::Sqincw, 16, kWRegSize, bigpos);
4044}
4045
4046TEST_SVE(sve_sqincd) {
4047 int64_t bigneg = INT64_MIN + 42;
4048 int64_t bigpos = INT64_MAX - 42;
4049 QIncHelper<int64_t>(config, &MacroAssembler::Sqincd, 1, kDRegSize, 1);
4050 QIncHelper<int64_t>(config, &MacroAssembler::Sqincd, 2, kDRegSize, bigneg);
4051 QIncHelper<int64_t>(config, &MacroAssembler::Sqincd, 15, kDRegSize, 999);
4052 QIncHelper<int64_t>(config, &MacroAssembler::Sqincd, 16, kDRegSize, bigpos);
4053}
4054
4055TEST_SVE(sve_uqdecb) {
4056 uint32_t big32 = UINT32_MAX - 42;
4057 uint64_t big64 = UINT64_MAX - 42;
4058 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecb, 1, kBRegSize, 1);
4059 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecb, 2, kBRegSize, 42);
4060 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecb, 15, kBRegSize, 999);
4061 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecb, 16, kBRegSize, big32);
4062 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecb, 1, kBRegSize, 1);
4063 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecb, 2, kBRegSize, 42);
4064 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecb, 15, kBRegSize, 999);
4065 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecb, 16, kBRegSize, big64);
4066}
4067
4068TEST_SVE(sve_uqdech) {
4069 uint32_t big32 = UINT32_MAX - 42;
4070 uint64_t big64 = UINT64_MAX - 42;
4071 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdech, 1, kHRegSize, 1);
4072 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdech, 2, kHRegSize, 42);
4073 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdech, 15, kHRegSize, 999);
4074 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdech, 16, kHRegSize, big32);
4075 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdech, 1, kHRegSize, 1);
4076 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdech, 2, kHRegSize, 42);
4077 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdech, 15, kHRegSize, 999);
4078 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdech, 16, kHRegSize, big64);
4079}
4080
4081TEST_SVE(sve_uqdecw) {
4082 uint32_t big32 = UINT32_MAX - 42;
4083 uint64_t big64 = UINT64_MAX - 42;
4084 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecw, 1, kWRegSize, 1);
4085 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecw, 2, kWRegSize, 42);
4086 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecw, 15, kWRegSize, 999);
4087 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecw, 16, kWRegSize, big32);
4088 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecw, 1, kWRegSize, 1);
4089 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecw, 2, kWRegSize, 42);
4090 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecw, 15, kWRegSize, 999);
4091 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecw, 16, kWRegSize, big64);
4092}
4093
4094TEST_SVE(sve_uqdecd) {
4095 uint32_t big32 = UINT32_MAX - 42;
4096 uint64_t big64 = UINT64_MAX - 42;
4097 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecd, 1, kDRegSize, 1);
4098 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecd, 2, kDRegSize, 42);
4099 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecd, 15, kDRegSize, 999);
4100 QDecHelper<uint32_t>(config, &MacroAssembler::Uqdecd, 16, kDRegSize, big32);
4101 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecd, 1, kDRegSize, 1);
4102 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecd, 2, kDRegSize, 42);
4103 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecd, 15, kDRegSize, 999);
4104 QDecHelper<uint64_t>(config, &MacroAssembler::Uqdecd, 16, kDRegSize, big64);
4105}
4106
4107TEST_SVE(sve_uqincb) {
4108 uint32_t big32 = UINT32_MAX - 42;
4109 uint64_t big64 = UINT64_MAX - 42;
4110 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincb, 1, kBRegSize, 1);
4111 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincb, 2, kBRegSize, 42);
4112 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincb, 15, kBRegSize, 999);
4113 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincb, 16, kBRegSize, big32);
4114 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincb, 1, kBRegSize, 1);
4115 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincb, 2, kBRegSize, 42);
4116 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincb, 15, kBRegSize, 999);
4117 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincb, 16, kBRegSize, big64);
4118}
4119
4120TEST_SVE(sve_uqinch) {
4121 uint32_t big32 = UINT32_MAX - 42;
4122 uint64_t big64 = UINT64_MAX - 42;
4123 QIncHelper<uint32_t>(config, &MacroAssembler::Uqinch, 1, kHRegSize, 1);
4124 QIncHelper<uint32_t>(config, &MacroAssembler::Uqinch, 2, kHRegSize, 42);
4125 QIncHelper<uint32_t>(config, &MacroAssembler::Uqinch, 15, kHRegSize, 999);
4126 QIncHelper<uint32_t>(config, &MacroAssembler::Uqinch, 16, kHRegSize, big32);
4127 QIncHelper<uint64_t>(config, &MacroAssembler::Uqinch, 1, kHRegSize, 1);
4128 QIncHelper<uint64_t>(config, &MacroAssembler::Uqinch, 2, kHRegSize, 42);
4129 QIncHelper<uint64_t>(config, &MacroAssembler::Uqinch, 15, kHRegSize, 999);
4130 QIncHelper<uint64_t>(config, &MacroAssembler::Uqinch, 16, kHRegSize, big64);
4131}
4132
4133TEST_SVE(sve_uqincw) {
4134 uint32_t big32 = UINT32_MAX - 42;
4135 uint64_t big64 = UINT64_MAX - 42;
4136 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincw, 1, kWRegSize, 1);
4137 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincw, 2, kWRegSize, 42);
4138 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincw, 15, kWRegSize, 999);
4139 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincw, 16, kWRegSize, big32);
4140 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincw, 1, kWRegSize, 1);
4141 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincw, 2, kWRegSize, 42);
4142 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincw, 15, kWRegSize, 999);
4143 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincw, 16, kWRegSize, big64);
4144}
4145
4146TEST_SVE(sve_uqincd) {
4147 uint32_t big32 = UINT32_MAX - 42;
4148 uint64_t big64 = UINT64_MAX - 42;
4149 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincd, 1, kDRegSize, 1);
4150 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincd, 2, kDRegSize, 42);
4151 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincd, 15, kDRegSize, 999);
4152 QIncHelper<uint32_t>(config, &MacroAssembler::Uqincd, 16, kDRegSize, big32);
4153 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincd, 1, kDRegSize, 1);
4154 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincd, 2, kDRegSize, 42);
4155 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincd, 15, kDRegSize, 999);
4156 QIncHelper<uint64_t>(config, &MacroAssembler::Uqincd, 16, kDRegSize, big64);
4157}
4158
4159typedef void (MacroAssembler::*QIncDecXWFn)(const Register& dst,
4160 const Register& src,
4161 int pattern,
4162 int multiplier);
4163
4164static void QIncDecXWHelper(Test* config,
4165 QIncDecXWFn cnt,
4166 int multiplier,
4167 int lane_size_in_bits,
4168 int32_t acc_value,
4169 bool is_increment) {
4170 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
4171 START();
4172
4173 // Initialise accumulators.
4174 __ Mov(x0, acc_value);
4175 __ Mov(x1, acc_value);
4176 __ Mov(x2, acc_value);
4177 __ Mov(x3, acc_value);
4178 __ Mov(x4, acc_value);
4179 __ Mov(x5, acc_value);
4180 __ Mov(x6, acc_value);
4181 __ Mov(x7, acc_value);
4182 __ Mov(x8, acc_value);
4183 __ Mov(x9, acc_value);
4184 __ Mov(x10, acc_value);
4185 __ Mov(x11, acc_value);
4186 __ Mov(x12, acc_value);
4187 __ Mov(x13, acc_value);
4188 __ Mov(x14, acc_value);
4189 __ Mov(x15, acc_value);
4190 __ Mov(x18, acc_value);
4191 __ Mov(x19, acc_value);
4192 __ Mov(x20, acc_value);
4193 __ Mov(x21, acc_value);
4194
4195 (masm.*cnt)(x0, w0, SVE_POW2, multiplier);
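  // These are the <Xd>, <Wn> forms: the 32-bit accumulator is read from the W
  // register and the saturated result is written, sign-extended, to the X
  // register (the signed sqinc/sqdec variants tested below).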
4196 (masm.*cnt)(x1, w1, SVE_VL1, multiplier);
4197 (masm.*cnt)(x2, w2, SVE_VL2, multiplier);
4198 (masm.*cnt)(x3, w3, SVE_VL3, multiplier);
4199 (masm.*cnt)(x4, w4, SVE_VL4, multiplier);
4200 (masm.*cnt)(x5, w5, SVE_VL5, multiplier);
4201 (masm.*cnt)(x6, w6, SVE_VL6, multiplier);
4202 (masm.*cnt)(x7, w7, SVE_VL7, multiplier);
4203 (masm.*cnt)(x8, w8, SVE_VL8, multiplier);
4204 (masm.*cnt)(x9, w9, SVE_VL16, multiplier);
4205 (masm.*cnt)(x10, w10, SVE_VL32, multiplier);
4206 (masm.*cnt)(x11, w11, SVE_VL64, multiplier);
4207 (masm.*cnt)(x12, w12, SVE_VL128, multiplier);
4208 (masm.*cnt)(x13, w13, SVE_VL256, multiplier);
4209 (masm.*cnt)(x14, w14, 16, multiplier);
4210 (masm.*cnt)(x15, w15, 23, multiplier);
4211 (masm.*cnt)(x18, w18, 28, multiplier);
4212 (masm.*cnt)(x19, w19, SVE_MUL4, multiplier);
4213 (masm.*cnt)(x20, w20, SVE_MUL3, multiplier);
4214 (masm.*cnt)(x21, w21, SVE_ALL, multiplier);
4215
4216 END();
4217
4218 if (CAN_RUN()) {
4219 RUN();
4220
4221 int all = core.GetSVELaneCount(lane_size_in_bits);
4222 int pow2 = 1 << HighestSetBitPosition(all);
4223 int mul4 = all - (all % 4);
4224 int mul3 = all - (all % 3);
4225
4226 multiplier = is_increment ? multiplier : -multiplier;
4227
4228 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * pow2), x0);
4229 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(1, all)), x1);
4230 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(2, all)), x2);
4231 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(3, all)), x3);
4232 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(4, all)), x4);
4233 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(5, all)), x5);
4234 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(6, all)), x6);
4235 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(7, all)), x7);
4236 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(8, all)), x8);
4237 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(16, all)), x9);
4238 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(32, all)), x10);
4239 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(64, all)), x11);
4240 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(128, all)), x12);
4241 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * FixedVL(256, all)), x13);
4242 ASSERT_EQUAL_64(acc_value, x14);
4243 ASSERT_EQUAL_64(acc_value, x15);
4244 ASSERT_EQUAL_64(acc_value, x18);
4245 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * mul4), x19);
4246 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * mul3), x20);
4247 ASSERT_EQUAL_64(QAdd(acc_value, multiplier * all), x21);
4248 }
4249}
4250
4251static void QIncXWHelper(Test* config,
4252 QIncDecXWFn cnt,
4253 int multiplier,
4254 int lane_size_in_bits,
4255 int32_t acc_value) {
4256 QIncDecXWHelper(config, cnt, multiplier, lane_size_in_bits, acc_value, true);
4257}
4258
4259static void QDecXWHelper(Test* config,
4260 QIncDecXWFn cnt,
4261 int multiplier,
4262 int lane_size_in_bits,
4263 int32_t acc_value) {
4264 QIncDecXWHelper(config, cnt, multiplier, lane_size_in_bits, acc_value, false);
4265}
4266
4267TEST_SVE(sve_sqdecb_xw) {
4268 QDecXWHelper(config, &MacroAssembler::Sqdecb, 1, kBRegSize, 1);
4269 QDecXWHelper(config, &MacroAssembler::Sqdecb, 2, kBRegSize, INT32_MIN + 42);
4270 QDecXWHelper(config, &MacroAssembler::Sqdecb, 15, kBRegSize, 999);
4271 QDecXWHelper(config, &MacroAssembler::Sqdecb, 16, kBRegSize, INT32_MAX - 42);
4272}
4273
4274TEST_SVE(sve_sqdech_xw) {
4275 QDecXWHelper(config, &MacroAssembler::Sqdech, 1, kHRegSize, 1);
4276 QDecXWHelper(config, &MacroAssembler::Sqdech, 2, kHRegSize, INT32_MIN + 42);
4277 QDecXWHelper(config, &MacroAssembler::Sqdech, 15, kHRegSize, 999);
4278 QDecXWHelper(config, &MacroAssembler::Sqdech, 16, kHRegSize, INT32_MAX - 42);
4279}
4280
4281TEST_SVE(sve_sqdecw_xw) {
4282 QDecXWHelper(config, &MacroAssembler::Sqdecw, 1, kWRegSize, 1);
4283 QDecXWHelper(config, &MacroAssembler::Sqdecw, 2, kWRegSize, INT32_MIN + 42);
4284 QDecXWHelper(config, &MacroAssembler::Sqdecw, 15, kWRegSize, 999);
4285 QDecXWHelper(config, &MacroAssembler::Sqdecw, 16, kWRegSize, INT32_MAX - 42);
4286}
4287
4288TEST_SVE(sve_sqdecd_xw) {
4289 QDecXWHelper(config, &MacroAssembler::Sqdecd, 1, kDRegSize, 1);
4290 QDecXWHelper(config, &MacroAssembler::Sqdecd, 2, kDRegSize, INT32_MIN + 42);
4291 QDecXWHelper(config, &MacroAssembler::Sqdecd, 15, kDRegSize, 999);
4292 QDecXWHelper(config, &MacroAssembler::Sqdecd, 16, kDRegSize, INT32_MAX - 42);
4293}
4294
4295TEST_SVE(sve_sqincb_xw) {
4296 QIncXWHelper(config, &MacroAssembler::Sqincb, 1, kBRegSize, 1);
4297 QIncXWHelper(config, &MacroAssembler::Sqincb, 2, kBRegSize, INT32_MIN + 42);
4298 QIncXWHelper(config, &MacroAssembler::Sqincb, 15, kBRegSize, 999);
4299 QIncXWHelper(config, &MacroAssembler::Sqincb, 16, kBRegSize, INT32_MAX - 42);
4300}
4301
4302TEST_SVE(sve_sqinch_xw) {
4303 QIncXWHelper(config, &MacroAssembler::Sqinch, 1, kHRegSize, 1);
4304 QIncXWHelper(config, &MacroAssembler::Sqinch, 2, kHRegSize, INT32_MIN + 42);
4305 QIncXWHelper(config, &MacroAssembler::Sqinch, 15, kHRegSize, 999);
4306 QIncXWHelper(config, &MacroAssembler::Sqinch, 16, kHRegSize, INT32_MAX - 42);
4307}
4308
4309TEST_SVE(sve_sqincw_xw) {
4310 QIncXWHelper(config, &MacroAssembler::Sqincw, 1, kWRegSize, 1);
4311 QIncXWHelper(config, &MacroAssembler::Sqincw, 2, kWRegSize, INT32_MIN + 42);
4312 QIncXWHelper(config, &MacroAssembler::Sqincw, 15, kWRegSize, 999);
4313 QIncXWHelper(config, &MacroAssembler::Sqincw, 16, kWRegSize, INT32_MAX - 42);
4314}
4315
4316TEST_SVE(sve_sqincd_xw) {
4317 QIncXWHelper(config, &MacroAssembler::Sqincd, 1, kDRegSize, 1);
4318 QIncXWHelper(config, &MacroAssembler::Sqincd, 2, kDRegSize, INT32_MIN + 42);
4319 QIncXWHelper(config, &MacroAssembler::Sqincd, 15, kDRegSize, 999);
4320 QIncXWHelper(config, &MacroAssembler::Sqincd, 16, kDRegSize, INT32_MAX - 42);
4321}
4322
Martyn Capewell8188ddf2019-11-21 17:09:34 +00004323typedef void (MacroAssembler::*IncDecZFn)(const ZRegister& dst,
4324 int pattern,
4325 int multiplier);
4326typedef void (MacroAssembler::*AddSubFn)(const ZRegister& dst,
4327 const ZRegister& src1,
4328 const ZRegister& src2);
4329
4330static void IncDecZHelper(Test* config,
4331 IncDecZFn fn,
4332 CntFn cnt,
4333 AddSubFn addsub,
4334 int multiplier,
4335 int lane_size_in_bits) {
4336 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
4337 START();
4338
4339 uint64_t acc_inputs[] = {0x7766554433221100,
4340 0xffffffffffffffff,
4341 0x0000000000000000,
4342 0xffffffff0000ffff,
4343 0x7fffffffffffffff,
4344 0x8000000000000000,
4345 0x7fffffff7fff7fff,
4346 0x8000000080008000};
4347
4348 for (unsigned i = 0; i < kNumberOfZRegisters; i++) {
4349 for (int j = 0; j < 4; j++) {
4350 InsrHelper(&masm, ZRegister(i, kDRegSize), acc_inputs);
4351 }
4352 }
4353 for (unsigned i = 0; i < 15; i++) {
4354 __ Mov(XRegister(i), 0);
4355 }
4356
4357 (masm.*fn)(z16.WithLaneSize(lane_size_in_bits), SVE_POW2, multiplier);
4358 (masm.*fn)(z17.WithLaneSize(lane_size_in_bits), SVE_VL1, multiplier);
4359 (masm.*fn)(z18.WithLaneSize(lane_size_in_bits), SVE_VL2, multiplier);
4360 (masm.*fn)(z19.WithLaneSize(lane_size_in_bits), SVE_VL3, multiplier);
4361 (masm.*fn)(z20.WithLaneSize(lane_size_in_bits), SVE_VL4, multiplier);
4362 (masm.*fn)(z21.WithLaneSize(lane_size_in_bits), SVE_VL7, multiplier);
4363 (masm.*fn)(z22.WithLaneSize(lane_size_in_bits), SVE_VL8, multiplier);
4364 (masm.*fn)(z23.WithLaneSize(lane_size_in_bits), SVE_VL16, multiplier);
4365 (masm.*fn)(z24.WithLaneSize(lane_size_in_bits), SVE_VL64, multiplier);
4366 (masm.*fn)(z25.WithLaneSize(lane_size_in_bits), SVE_VL256, multiplier);
4367 (masm.*fn)(z26.WithLaneSize(lane_size_in_bits), 16, multiplier);
4368 (masm.*fn)(z27.WithLaneSize(lane_size_in_bits), 28, multiplier);
4369 (masm.*fn)(z28.WithLaneSize(lane_size_in_bits), SVE_MUL3, multiplier);
4370 (masm.*fn)(z29.WithLaneSize(lane_size_in_bits), SVE_MUL4, multiplier);
4371 (masm.*fn)(z30.WithLaneSize(lane_size_in_bits), SVE_ALL, multiplier);
4372
4373 // Compute reference results using the equivalent scalar count instructions.
4374 (masm.*cnt)(x0, SVE_POW2, multiplier);
4375 (masm.*cnt)(x1, SVE_VL1, multiplier);
4376 (masm.*cnt)(x2, SVE_VL2, multiplier);
4377 (masm.*cnt)(x3, SVE_VL3, multiplier);
4378 (masm.*cnt)(x4, SVE_VL4, multiplier);
4379 (masm.*cnt)(x5, SVE_VL7, multiplier);
4380 (masm.*cnt)(x6, SVE_VL8, multiplier);
4381 (masm.*cnt)(x7, SVE_VL16, multiplier);
4382 (masm.*cnt)(x8, SVE_VL64, multiplier);
4383 (masm.*cnt)(x9, SVE_VL256, multiplier);
4384 (masm.*cnt)(x10, 16, multiplier);
4385 (masm.*cnt)(x11, 28, multiplier);
4386 (masm.*cnt)(x12, SVE_MUL3, multiplier);
4387 (masm.*cnt)(x13, SVE_MUL4, multiplier);
4388 (masm.*cnt)(x14, SVE_ALL, multiplier);
4389
4390 ZRegister zscratch = z15.WithLaneSize(lane_size_in_bits);
4391 for (unsigned i = 0; i < 15; i++) {
4392 ZRegister zsrcdst = ZRegister(i, lane_size_in_bits);
4393 Register x = Register(i, kXRegSize);
4394 __ Dup(zscratch, x);
4395 (masm.*addsub)(zsrcdst, zsrcdst, zscratch);
4396 }
4397
4398 END();
4399
4400 if (CAN_RUN()) {
4401 RUN();
4402
4403 ASSERT_EQUAL_SVE(z0, z16);
4404 ASSERT_EQUAL_SVE(z1, z17);
4405 ASSERT_EQUAL_SVE(z2, z18);
4406 ASSERT_EQUAL_SVE(z3, z19);
4407 ASSERT_EQUAL_SVE(z4, z20);
4408 ASSERT_EQUAL_SVE(z5, z21);
4409 ASSERT_EQUAL_SVE(z6, z22);
4410 ASSERT_EQUAL_SVE(z7, z23);
4411 ASSERT_EQUAL_SVE(z8, z24);
4412 ASSERT_EQUAL_SVE(z9, z25);
4413 ASSERT_EQUAL_SVE(z10, z26);
4414 ASSERT_EQUAL_SVE(z11, z27);
4415 ASSERT_EQUAL_SVE(z12, z28);
4416 ASSERT_EQUAL_SVE(z13, z29);
4417 ASSERT_EQUAL_SVE(z14, z30);
4418 }
4419}
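
// In other words, for each lane size the helper checks equivalences of the
// following shape (shown here for the SVE_ALL, D-lane case, using the same
// register assignments as above):
//
//   __ Incd(z30.VnD(), SVE_ALL, multiplier);  // Form under test.
//
//   __ Cntd(x14, SVE_ALL, multiplier);        // Reference: scalar count...
//   __ Dup(z15.VnD(), x14);                   // ...broadcast...
//   __ Add(z14.VnD(), z14.VnD(), z15.VnD());  // ...then accumulate.
//
//   ASSERT_EQUAL_SVE(z14, z30);               // Both start from the same value.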
4420
4421TEST_SVE(sve_inc_dec_vec) {
4422 CntFn cnth = &MacroAssembler::Cnth;
4423 CntFn cntw = &MacroAssembler::Cntw;
4424 CntFn cntd = &MacroAssembler::Cntd;
4425 AddSubFn sub = &MacroAssembler::Sub;
4426 AddSubFn add = &MacroAssembler::Add;
4427 for (int mult = 1; mult <= 16; mult += 5) {
4428 IncDecZHelper(config, &MacroAssembler::Dech, cnth, sub, mult, kHRegSize);
4429 IncDecZHelper(config, &MacroAssembler::Decw, cntw, sub, mult, kSRegSize);
4430 IncDecZHelper(config, &MacroAssembler::Decd, cntd, sub, mult, kDRegSize);
4431 IncDecZHelper(config, &MacroAssembler::Inch, cnth, add, mult, kHRegSize);
4432 IncDecZHelper(config, &MacroAssembler::Incw, cntw, add, mult, kSRegSize);
4433 IncDecZHelper(config, &MacroAssembler::Incd, cntd, add, mult, kDRegSize);
4434 }
4435}
4436
4437TEST_SVE(sve_unsigned_sat_inc_dec_vec) {
4438 CntFn cnth = &MacroAssembler::Cnth;
4439 CntFn cntw = &MacroAssembler::Cntw;
4440 CntFn cntd = &MacroAssembler::Cntd;
4441 AddSubFn sub = &MacroAssembler::Uqsub;
4442 AddSubFn add = &MacroAssembler::Uqadd;
4443 for (int mult = 1; mult <= 16; mult += 5) {
4444 IncDecZHelper(config, &MacroAssembler::Uqdech, cnth, sub, mult, kHRegSize);
4445 IncDecZHelper(config, &MacroAssembler::Uqdecw, cntw, sub, mult, kSRegSize);
4446 IncDecZHelper(config, &MacroAssembler::Uqdecd, cntd, sub, mult, kDRegSize);
4447 IncDecZHelper(config, &MacroAssembler::Uqinch, cnth, add, mult, kHRegSize);
4448 IncDecZHelper(config, &MacroAssembler::Uqincw, cntw, add, mult, kSRegSize);
4449 IncDecZHelper(config, &MacroAssembler::Uqincd, cntd, add, mult, kDRegSize);
4450 }
4451}
4452
4453TEST_SVE(sve_signed_sat_inc_dec_vec) {
4454 CntFn cnth = &MacroAssembler::Cnth;
4455 CntFn cntw = &MacroAssembler::Cntw;
4456 CntFn cntd = &MacroAssembler::Cntd;
4457 AddSubFn sub = &MacroAssembler::Sqsub;
4458 AddSubFn add = &MacroAssembler::Sqadd;
4459 for (int mult = 1; mult <= 16; mult += 5) {
4460 IncDecZHelper(config, &MacroAssembler::Sqdech, cnth, sub, mult, kHRegSize);
4461 IncDecZHelper(config, &MacroAssembler::Sqdecw, cntw, sub, mult, kSRegSize);
4462 IncDecZHelper(config, &MacroAssembler::Sqdecd, cntd, sub, mult, kDRegSize);
4463 IncDecZHelper(config, &MacroAssembler::Sqinch, cnth, add, mult, kHRegSize);
4464 IncDecZHelper(config, &MacroAssembler::Sqincw, cntw, add, mult, kSRegSize);
4465 IncDecZHelper(config, &MacroAssembler::Sqincd, cntd, add, mult, kDRegSize);
4466 }
4467}
4468
TatWai Chong7a0d3672019-10-23 17:35:18 -07004469typedef void (MacroAssembler::*ArithPredicatedFn)(const ZRegister& zd,
4470 const PRegisterM& pg,
4471 const ZRegister& zn,
4472 const ZRegister& zm);
TatWai Chong13634762019-07-16 16:20:45 -07004473
4474template <typename Td, typename Tg, typename Tn>
4475static void IntBinArithHelper(Test* config,
TatWai Chong7a0d3672019-10-23 17:35:18 -07004476 ArithPredicatedFn macro,
TatWai Chong13634762019-07-16 16:20:45 -07004477 unsigned lane_size_in_bits,
4478 const Tg& pg_inputs,
4479 const Tn& zn_inputs,
4480 const Tn& zm_inputs,
4481 const Td& zd_expected) {
4482 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
4483 START();
4484
Richard Neillb6725cf2023-02-24 13:32:42 +00004485 ZRegister src_a = z30.WithLaneSize(lane_size_in_bits);
TatWai Chong13634762019-07-16 16:20:45 -07004486 ZRegister src_b = z27.WithLaneSize(lane_size_in_bits);
4487 InsrHelper(&masm, src_a, zn_inputs);
4488 InsrHelper(&masm, src_b, zm_inputs);
4489
4490 Initialise(&masm, p0.WithLaneSize(lane_size_in_bits), pg_inputs);
4491
4492 ZRegister zd_1 = z0.WithLaneSize(lane_size_in_bits);
4493 ZRegister zd_2 = z1.WithLaneSize(lane_size_in_bits);
4494 ZRegister zd_3 = z2.WithLaneSize(lane_size_in_bits);
4495
4496 // `instr` zd(dst), zd(src_a), zn(src_b)
4497 __ Mov(zd_1, src_a);
4498 (masm.*macro)(zd_1, p0.Merging(), zd_1, src_b);
4499
4500 // `instr` zd(dst), zm(src_a), zd(src_b)
4501 // If the zd and zm registers are aliased, the instruction macro (`Instr`)
4502 // swaps the order of the operands when the operation is commutative;
4503 // otherwise, it falls back to the reversed form, such as subr or divr.
4504 __ Mov(zd_2, src_b);
4505 (masm.*macro)(zd_2, p0.Merging(), src_a, zd_2);
4506
4507 // `instr` zd(dst), zm(src_a), zn(src_b)
4508 // The instruction macro (`Instr`) automatically selects between `instr`
4509 // and movprfx + `instr`, based on whether the zd and zn registers are aliased.
TatWai Chongd316c5e2019-10-16 12:22:10 -07004510 // A generated movprfx instruction is predicated, using the same
TatWai Chong13634762019-07-16 16:20:45 -07004511 // governing predicate register. In order to keep the result constant,
4512 // initialize the destination register first.
4513 __ Mov(zd_3, src_a);
4514 (masm.*macro)(zd_3, p0.Merging(), src_a, src_b);
4515
4516 END();
4517
4518 if (CAN_RUN()) {
4519 RUN();
4520 ASSERT_EQUAL_SVE(zd_expected, zd_1);
4521
4522 for (size_t i = 0; i < ArrayLength(zd_expected); i++) {
4523 int lane = static_cast<int>(ArrayLength(zd_expected) - i - 1);
4524 if (!core.HasSVELane(zd_1, lane)) break;
TatWai Chongd316c5e2019-10-16 12:22:10 -07004525 if ((pg_inputs[i] & 1) != 0) {
TatWai Chong13634762019-07-16 16:20:45 -07004526 ASSERT_EQUAL_SVE_LANE(zd_expected[i], zd_1, lane);
4527 } else {
4528 ASSERT_EQUAL_SVE_LANE(zn_inputs[i], zd_1, lane);
4529 }
4530 }
4531
4532 ASSERT_EQUAL_SVE(zd_expected, zd_3);
4533 }
4534}
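
// For the aliasing cases exercised above, a predicated macro may be emitted as
// a merging movprfx followed by the destructive instruction. For example (a
// sketch only; the exact sequence is chosen by the MacroAssembler),
// `Add(z2.VnB(), p0.Merging(), z30.VnB(), z27.VnB())` can expand to:
//
//   movprfx z2.b, p0/m, z30.b
//   add z2.b, p0/m, z2.b, z27.b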
4535
4536TEST_SVE(sve_binary_arithmetic_predicated_add) {
4537 // clang-format off
4538 unsigned zn_b[] = {0x00, 0x01, 0x10, 0x81, 0xff, 0x0f, 0x01, 0x7f};
4539
4540 unsigned zm_b[] = {0x00, 0x01, 0x10, 0x00, 0x81, 0x80, 0xff, 0xff};
4541
4542 unsigned zn_h[] = {0x0000, 0x0123, 0x1010, 0x8181, 0xffff, 0x0f0f, 0x0101, 0x7f7f};
4543
4544 unsigned zm_h[] = {0x0000, 0x0123, 0x1010, 0x0000, 0x8181, 0x8080, 0xffff, 0xffff};
4545
4546 unsigned zn_s[] = {0x00000000, 0x01234567, 0x10101010, 0x81818181,
4547 0xffffffff, 0x0f0f0f0f, 0x01010101, 0x7f7f7f7f};
4548
4549 unsigned zm_s[] = {0x00000000, 0x01234567, 0x10101010, 0x00000000,
4550 0x81818181, 0x80808080, 0xffffffff, 0xffffffff};
4551
4552 uint64_t zn_d[] = {0x0000000000000000, 0x0123456789abcdef,
4553 0x1010101010101010, 0x8181818181818181,
4554 0xffffffffffffffff, 0x0f0f0f0f0f0f0f0f,
4555 0x0101010101010101, 0x7f7f7f7fffffffff};
4556
4557 uint64_t zm_d[] = {0x0000000000000000, 0x0123456789abcdef,
4558 0x1010101010101010, 0x0000000000000000,
4559 0x8181818181818181, 0x8080808080808080,
4560 0xffffffffffffffff, 0xffffffffffffffff};
4561
4562 int pg_b[] = {1, 1, 1, 0, 1, 1, 1, 0};
4563 int pg_h[] = {1, 1, 0, 1, 1, 1, 0, 1};
4564 int pg_s[] = {1, 0, 1, 1, 1, 0, 1, 1};
4565 int pg_d[] = {0, 1, 1, 1, 0, 1, 1, 1};
4566
4567 unsigned add_exp_b[] = {0x00, 0x02, 0x20, 0x81, 0x80, 0x8f, 0x00, 0x7f};
4568
4569 unsigned add_exp_h[] = {0x0000, 0x0246, 0x1010, 0x8181,
4570 0x8180, 0x8f8f, 0x0101, 0x7f7e};
4571
4572 unsigned add_exp_s[] = {0x00000000, 0x01234567, 0x20202020, 0x81818181,
4573 0x81818180, 0x0f0f0f0f, 0x01010100, 0x7f7f7f7e};
4574
4575 uint64_t add_exp_d[] = {0x0000000000000000, 0x02468acf13579bde,
4576 0x2020202020202020, 0x8181818181818181,
4577 0xffffffffffffffff, 0x8f8f8f8f8f8f8f8f,
4578 0x0101010101010100, 0x7f7f7f7ffffffffe};
4579
TatWai Chong7a0d3672019-10-23 17:35:18 -07004580 ArithPredicatedFn fn = &MacroAssembler::Add;
TatWai Chong13634762019-07-16 16:20:45 -07004581 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, add_exp_b);
4582 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, add_exp_h);
4583 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, add_exp_s);
4584 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, add_exp_d);
4585
4586 unsigned sub_exp_b[] = {0x00, 0x00, 0x00, 0x81, 0x7e, 0x8f, 0x02, 0x7f};
4587
4588 unsigned sub_exp_h[] = {0x0000, 0x0000, 0x1010, 0x8181,
4589 0x7e7e, 0x8e8f, 0x0101, 0x7f80};
4590
4591 unsigned sub_exp_s[] = {0x00000000, 0x01234567, 0x00000000, 0x81818181,
4592 0x7e7e7e7e, 0x0f0f0f0f, 0x01010102, 0x7f7f7f80};
4593
4594 uint64_t sub_exp_d[] = {0x0000000000000000, 0x0000000000000000,
4595 0x0000000000000000, 0x8181818181818181,
4596 0xffffffffffffffff, 0x8e8e8e8e8e8e8e8f,
4597 0x0101010101010102, 0x7f7f7f8000000000};
4598
4599 fn = &MacroAssembler::Sub;
4600 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, sub_exp_b);
4601 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, sub_exp_h);
4602 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, sub_exp_s);
4603 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, sub_exp_d);
4604 // clang-format on
4605}
4606
4607TEST_SVE(sve_binary_arithmetic_predicated_umin_umax_uabd) {
4608 // clang-format off
4609 unsigned zn_b[] = {0x00, 0xff, 0x0f, 0xff, 0xf0, 0x98, 0x55, 0x67};
4610
4611 unsigned zm_b[] = {0x01, 0x00, 0x0e, 0xfe, 0xfe, 0xab, 0xcd, 0x78};
4612
4613 unsigned zn_h[] = {0x0000, 0xffff, 0x00ff, 0xffff,
4614 0xff00, 0xba98, 0x5555, 0x4567};
4615
4616 unsigned zm_h[] = {0x0001, 0x0000, 0x00ee, 0xfffe,
4617 0xfe00, 0xabab, 0xcdcd, 0x5678};
4618
4619 unsigned zn_s[] = {0x00000000, 0xffffffff, 0x0000ffff, 0xffffffff,
4620 0xffff0000, 0xfedcba98, 0x55555555, 0x01234567};
4621
4622 unsigned zm_s[] = {0x00000001, 0x00000000, 0x0000eeee, 0xfffffffe,
4623 0xfffe0000, 0xabababab, 0xcdcdcdcd, 0x12345678};
4624
4625 uint64_t zn_d[] = {0x0000000000000000, 0xffffffffffffffff,
4626 0x5555555555555555, 0x0000000001234567};
4627
4628 uint64_t zm_d[] = {0x0000000000000001, 0x0000000000000000,
4629 0xcdcdcdcdcdcdcdcd, 0x0000000012345678};
4630
4631 int pg_b[] = {1, 1, 1, 0, 1, 1, 1, 0};
4632 int pg_h[] = {1, 1, 0, 1, 1, 1, 0, 1};
4633 int pg_s[] = {1, 0, 1, 1, 1, 0, 1, 1};
4634 int pg_d[] = {1, 0, 1, 1};
4635
4636 unsigned umax_exp_b[] = {0x01, 0xff, 0x0f, 0xff, 0xfe, 0xab, 0xcd, 0x67};
4637
4638 unsigned umax_exp_h[] = {0x0001, 0xffff, 0x00ff, 0xffff,
4639 0xff00, 0xba98, 0x5555, 0x5678};
4640
4641 unsigned umax_exp_s[] = {0x00000001, 0xffffffff, 0x0000ffff, 0xffffffff,
4642 0xffff0000, 0xfedcba98, 0xcdcdcdcd, 0x12345678};
4643
4644 uint64_t umax_exp_d[] = {0x0000000000000001, 0xffffffffffffffff,
4645 0xcdcdcdcdcdcdcdcd, 0x0000000012345678};
4646
TatWai Chong7a0d3672019-10-23 17:35:18 -07004647 ArithPredicatedFn fn = &MacroAssembler::Umax;
TatWai Chong13634762019-07-16 16:20:45 -07004648 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, umax_exp_b);
4649 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, umax_exp_h);
4650 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, umax_exp_s);
4651 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, umax_exp_d);
4652
4653 unsigned umin_exp_b[] = {0x00, 0x00, 0x0e, 0xff, 0xf0, 0x98, 0x55, 0x67};
4654
4655 unsigned umin_exp_h[] = {0x0000, 0x0000, 0x00ff, 0xfffe,
4656 0xfe00, 0xabab, 0x5555, 0x4567};
4657
4658 unsigned umin_exp_s[] = {0x00000000, 0xffffffff, 0x0000eeee, 0xfffffffe,
4659 0xfffe0000, 0xfedcba98, 0x55555555, 0x01234567};
4660
4661 uint64_t umin_exp_d[] = {0x0000000000000000, 0xffffffffffffffff,
4662 0x5555555555555555, 0x0000000001234567};
4663 fn = &MacroAssembler::Umin;
4664 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, umin_exp_b);
4665 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, umin_exp_h);
4666 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, umin_exp_s);
4667 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, umin_exp_d);
4668
4669 unsigned uabd_exp_b[] = {0x01, 0xff, 0x01, 0xff, 0x0e, 0x13, 0x78, 0x67};
4670
4671 unsigned uabd_exp_h[] = {0x0001, 0xffff, 0x00ff, 0x0001,
4672 0x0100, 0x0eed, 0x5555, 0x1111};
4673
4674 unsigned uabd_exp_s[] = {0x00000001, 0xffffffff, 0x00001111, 0x00000001,
4675 0x00010000, 0xfedcba98, 0x78787878, 0x11111111};
4676
4677 uint64_t uabd_exp_d[] = {0x0000000000000001, 0xffffffffffffffff,
4678 0x7878787878787878, 0x0000000011111111};
4679
4680 fn = &MacroAssembler::Uabd;
4681 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, uabd_exp_b);
4682 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, uabd_exp_h);
4683 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, uabd_exp_s);
4684 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, uabd_exp_d);
4685 // clang-format on
4686}
4687
4688TEST_SVE(sve_binary_arithmetic_predicated_smin_smax_sabd) {
4689 // clang-format off
4690 int zn_b[] = {0, -128, -128, -128, -128, 127, 127, 1};
4691
4692 int zm_b[] = {-1, 0, -1, -127, 127, 126, -1, 0};
4693
4694 int zn_h[] = {0, INT16_MIN, INT16_MIN, INT16_MIN,
4695 INT16_MIN, INT16_MAX, INT16_MAX, 1};
4696
4697 int zm_h[] = {-1, 0, -1, INT16_MIN + 1,
4698 INT16_MAX, INT16_MAX - 1, -1, 0};
4699
4700 int zn_s[] = {0, INT32_MIN, INT32_MIN, INT32_MIN,
4701 INT32_MIN, INT32_MAX, INT32_MAX, 1};
4702
4703 int zm_s[] = {-1, 0, -1, -INT32_MAX,
4704 INT32_MAX, INT32_MAX - 1, -1, 0};
4705
4706 int64_t zn_d[] = {0, INT64_MIN, INT64_MIN, INT64_MIN,
4707 INT64_MIN, INT64_MAX, INT64_MAX, 1};
4708
4709 int64_t zm_d[] = {-1, 0, -1, INT64_MIN + 1,
4710 INT64_MAX, INT64_MAX - 1, -1, 0};
4711
4712 int pg_b[] = {1, 1, 1, 0, 1, 1, 1, 0};
4713 int pg_h[] = {1, 1, 0, 1, 1, 1, 0, 1};
4714 int pg_s[] = {1, 0, 1, 1, 1, 0, 1, 1};
4715 int pg_d[] = {0, 1, 1, 1, 0, 1, 1, 1};
4716
4717 int smax_exp_b[] = {0, 0, -1, -128, 127, 127, 127, 1};
4718
4719 int smax_exp_h[] = {0, 0, INT16_MIN, INT16_MIN + 1,
4720 INT16_MAX, INT16_MAX, INT16_MAX, 1};
4721
4722 int smax_exp_s[] = {0, INT32_MIN, -1, INT32_MIN + 1,
4723 INT32_MAX, INT32_MAX, INT32_MAX, 1};
4724
4725 int64_t smax_exp_d[] = {0, 0, -1, INT64_MIN + 1,
4726 INT64_MIN, INT64_MAX, INT64_MAX, 1};
4727
TatWai Chong7a0d3672019-10-23 17:35:18 -07004728 ArithPredicatedFn fn = &MacroAssembler::Smax;
TatWai Chong13634762019-07-16 16:20:45 -07004729 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, smax_exp_b);
4730 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, smax_exp_h);
4731 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, smax_exp_s);
4732 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, smax_exp_d);
4733
4734 int smin_exp_b[] = {-1, -128, -128, -128, -128, 126, -1, 1};
4735
4736 int smin_exp_h[] = {-1, INT16_MIN, INT16_MIN, INT16_MIN,
4737 INT16_MIN, INT16_MAX - 1, INT16_MAX, 0};
4738
4739 int smin_exp_s[] = {-1, INT32_MIN, INT32_MIN, INT32_MIN,
4740 INT32_MIN, INT32_MAX, -1, 0};
4741
4742 int64_t smin_exp_d[] = {0, INT64_MIN, INT64_MIN, INT64_MIN,
4743 INT64_MIN, INT64_MAX - 1, -1, 0};
4744
4745 fn = &MacroAssembler::Smin;
4746 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, smin_exp_b);
4747 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, smin_exp_h);
4748 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, smin_exp_s);
4749 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, smin_exp_d);
4750
4751 unsigned sabd_exp_b[] = {1, 128, 127, 128, 255, 1, 128, 1};
4752
4753 unsigned sabd_exp_h[] = {1, 0x8000, 0x8000, 1, 0xffff, 1, 0x7fff, 1};
4754
4755 unsigned sabd_exp_s[] = {1, 0x80000000, 0x7fffffff, 1,
4756 0xffffffff, 0x7fffffff, 0x80000000, 1};
4757
4758 uint64_t sabd_exp_d[] = {0, 0x8000000000000000, 0x7fffffffffffffff, 1,
4759 0x8000000000000000, 1, 0x8000000000000000, 1};
4760
4761 fn = &MacroAssembler::Sabd;
4762 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, sabd_exp_b);
4763 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, sabd_exp_h);
4764 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, sabd_exp_s);
4765 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, sabd_exp_d);
4766 // clang-format on
4767}
4768
4769TEST_SVE(sve_binary_arithmetic_predicated_mul_umulh) {
4770 // clang-format off
4771 unsigned zn_b[] = {0x00, 0x01, 0x20, 0x08, 0x80, 0xff, 0x55, 0xaa};
4772
4773 unsigned zm_b[] = {0x7f, 0xcd, 0x80, 0xff, 0x55, 0xaa, 0x00, 0x08};
4774
4775 unsigned zn_h[] = {0x0000, 0x0001, 0x0020, 0x0800,
4776 0x8000, 0xff00, 0x5555, 0xaaaa};
4777
4778 unsigned zm_h[] = {0x007f, 0x00cd, 0x0800, 0xffff,
4779 0x5555, 0xaaaa, 0x0001, 0x1234};
4780
4781 unsigned zn_s[] = {0x00000000, 0x00000001, 0x00200020, 0x08000800,
4782 0x12345678, 0xffffffff, 0x55555555, 0xaaaaaaaa};
4783
4784 unsigned zm_s[] = {0x00000000, 0x00000001, 0x00200020, 0x08000800,
4785 0x12345678, 0x22223333, 0x55556666, 0x77778888};
4786
4787 uint64_t zn_d[] = {0x0000000000000000, 0x5555555555555555,
4788 0xffffffffffffffff, 0xaaaaaaaaaaaaaaaa};
4789
4790 uint64_t zm_d[] = {0x0000000000000000, 0x1111111133333333,
4791 0xddddddddeeeeeeee, 0xaaaaaaaaaaaaaaaa};
4792
4793 int pg_b[] = {0, 1, 1, 1, 0, 1, 1, 1};
4794 int pg_h[] = {1, 0, 1, 1, 1, 0, 1, 1};
4795 int pg_s[] = {1, 1, 0, 1, 1, 1, 0, 1};
4796 int pg_d[] = {1, 1, 0, 1};
4797
4798 unsigned mul_exp_b[] = {0x00, 0xcd, 0x00, 0xf8, 0x80, 0x56, 0x00, 0x50};
4799
4800 unsigned mul_exp_h[] = {0x0000, 0x0001, 0x0000, 0xf800,
4801 0x8000, 0xff00, 0x5555, 0x9e88};
4802
4803 unsigned mul_exp_s[] = {0x00000000, 0x00000001, 0x00200020, 0x00400000,
4804 0x1df4d840, 0xddddcccd, 0x55555555, 0xb05afa50};
4805
4806 uint64_t mul_exp_d[] = {0x0000000000000000, 0xa4fa4fa4eeeeeeef,
4807 0xffffffffffffffff, 0x38e38e38e38e38e4};
4808
TatWai Chong7a0d3672019-10-23 17:35:18 -07004809 ArithPredicatedFn fn = &MacroAssembler::Mul;
TatWai Chong13634762019-07-16 16:20:45 -07004810 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, mul_exp_b);
4811 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, mul_exp_h);
4812 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, mul_exp_s);
4813 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, mul_exp_d);
4814
4815 unsigned umulh_exp_b[] = {0x00, 0x00, 0x10, 0x07, 0x80, 0xa9, 0x00, 0x05};
4816
4817 unsigned umulh_exp_h[] = {0x0000, 0x0001, 0x0001, 0x07ff,
4818 0x2aaa, 0xff00, 0x0000, 0x0c22};
4819
4820 unsigned umulh_exp_s[] = {0x00000000, 0x00000000, 0x00200020, 0x00400080,
4821 0x014b66dc, 0x22223332, 0x55555555, 0x4fa505af};
4822
4823 uint64_t umulh_exp_d[] = {0x0000000000000000, 0x05b05b05bbbbbbbb,
4824 0xffffffffffffffff, 0x71c71c71c71c71c6};
4825
4826 fn = &MacroAssembler::Umulh;
4827 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, umulh_exp_b);
4828 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, umulh_exp_h);
4829 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, umulh_exp_s);
4830 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, umulh_exp_d);
4831 // clang-format on
4832}
4833
4834TEST_SVE(sve_binary_arithmetic_predicated_smulh) {
4835 // clang-format off
4836 int zn_b[] = {0, 1, -1, INT8_MIN, INT8_MAX, -1, 100, -3};
4837
4838 int zm_b[] = {0, INT8_MIN, INT8_MIN, INT8_MAX, INT8_MAX, -1, 2, 66};
4839
4840 int zn_h[] = {0, 1, -1, INT16_MIN, INT16_MAX, -1, 10000, -3};
4841
4842 int zm_h[] = {0, INT16_MIN, INT16_MIN, INT16_MAX, INT16_MAX, -1, 2, 6666};
4843
4844 int zn_s[] = {0, 1, -1, INT32_MIN, INT32_MAX, -1, 100000000, -3};
4845
4846 int zm_s[] = {0, INT32_MIN, INT32_MIN, INT32_MAX, INT32_MAX, -1, 2, 66666666};
4847
4848 int64_t zn_d[] = {0, -1, INT64_MIN, INT64_MAX};
4849
4850 int64_t zm_d[] = {INT64_MIN, INT64_MAX, INT64_MIN, INT64_MAX};
4851
4852 int pg_b[] = {0, 1, 1, 1, 0, 1, 1, 1};
4853 int pg_h[] = {1, 0, 1, 1, 1, 0, 1, 1};
4854 int pg_s[] = {1, 1, 0, 1, 1, 1, 0, 1};
4855 int pg_d[] = {1, 1, 0, 1};
4856
4857 int exp_b[] = {0, -1, 0, -64, INT8_MAX, 0, 0, -1};
4858
4859 int exp_h[] = {0, 1, 0, -16384, 16383, -1, 0, -1};
4860
4861 int exp_s[] = {0, -1, -1, -1073741824, 1073741823, 0, 100000000, -1};
4862
4863 int64_t exp_d[] = {0, -1, INT64_MIN, 4611686018427387903};
4864
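  // Smulh keeps the high half of the doubled-width signed product. For example,
  // the INT8_MIN * INT8_MAX element above produces -16256 (0xc080 as 16 bits),
  // so the expected high byte is 0xc0, ie. -64, as encoded in exp_b.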
TatWai Chong7a0d3672019-10-23 17:35:18 -07004865 ArithPredicatedFn fn = &MacroAssembler::Smulh;
TatWai Chong13634762019-07-16 16:20:45 -07004866 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, exp_b);
4867 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, exp_h);
4868 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, exp_s);
4869 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, exp_d);
4870 // clang-format on
4871}
4872
4873TEST_SVE(sve_binary_arithmetic_predicated_logical) {
4874 // clang-format off
4875 unsigned zn_b[] = {0x00, 0x01, 0x20, 0x08, 0x80, 0xff, 0x55, 0xaa};
4876 unsigned zm_b[] = {0x7f, 0xcd, 0x80, 0xff, 0x55, 0xaa, 0x00, 0x08};
4877
4878 unsigned zn_h[] = {0x0000, 0x0001, 0x2020, 0x0008,
4879 0x8000, 0xffff, 0x5555, 0xaaaa};
4880 unsigned zm_h[] = {0x7fff, 0xabcd, 0x8000, 0xffff,
4881 0x5555, 0xaaaa, 0x0000, 0x0800};
4882
4883 unsigned zn_s[] = {0x00000001, 0x20200008, 0x8000ffff, 0x5555aaaa};
4884 unsigned zm_s[] = {0x7fffabcd, 0x8000ffff, 0x5555aaaa, 0x00000800};
4885
4886 uint64_t zn_d[] = {0xfedcba9876543210, 0x0123456789abcdef,
4887 0x0001200880ff55aa, 0x0022446688aaccee};
4888 uint64_t zm_d[] = {0xffffeeeeddddcccc, 0xccccddddeeeeffff,
4889 0x7fcd80ff55aa0008, 0x1133557799bbddff};
4890
4891 int pg_b[] = {0, 1, 1, 1, 0, 1, 1, 1};
4892 int pg_h[] = {1, 0, 1, 1, 1, 0, 1, 1};
4893 int pg_s[] = {1, 1, 1, 0};
4894 int pg_d[] = {1, 1, 0, 1};
4895
4896 unsigned and_exp_b[] = {0x00, 0x01, 0x00, 0x08, 0x80, 0xaa, 0x00, 0x08};
4897
4898 unsigned and_exp_h[] = {0x0000, 0x0001, 0x0000, 0x0008,
4899 0x0000, 0xffff, 0x0000, 0x0800};
4900
4901 unsigned and_exp_s[] = {0x00000001, 0x00000008, 0x0000aaaa, 0x5555aaaa};
4902
4903 uint64_t and_exp_d[] = {0xfedcaa8854540000, 0x0000454588aacdef,
4904 0x0001200880ff55aa, 0x0022446688aaccee};
4905
TatWai Chong7a0d3672019-10-23 17:35:18 -07004906 ArithPredicatedFn fn = &MacroAssembler::And;
TatWai Chong13634762019-07-16 16:20:45 -07004907 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, and_exp_b);
4908 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, and_exp_h);
4909 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, and_exp_s);
4910 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, and_exp_d);
4911
4912 unsigned bic_exp_b[] = {0x00, 0x00, 0x20, 0x00, 0x80, 0x55, 0x55, 0xa2};
4913
4914 unsigned bic_exp_h[] = {0x0000, 0x0001, 0x2020, 0x0000,
4915 0x8000, 0xffff, 0x5555, 0xa2aa};
4916
4917 unsigned bic_exp_s[] = {0x00000000, 0x20200000, 0x80005555, 0x5555aaaa};
4918
4919 uint64_t bic_exp_d[] = {0x0000101022003210, 0x0123002201010000,
4920 0x0001200880ff55aa, 0x0000000000000000};
4921
4922 fn = &MacroAssembler::Bic;
4923 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, bic_exp_b);
4924 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, bic_exp_h);
4925 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, bic_exp_s);
4926 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, bic_exp_d);
4927
4928 unsigned eor_exp_b[] = {0x00, 0xcc, 0xa0, 0xf7, 0x80, 0x55, 0x55, 0xa2};
4929
4930 unsigned eor_exp_h[] = {0x7fff, 0x0001, 0xa020, 0xfff7,
4931 0xd555, 0xffff, 0x5555, 0xa2aa};
4932
4933 unsigned eor_exp_s[] = {0x7fffabcc, 0xa020fff7, 0xd5555555, 0x5555aaaa};
4934
4935 uint64_t eor_exp_d[] = {0x01235476ab89fedc, 0xcdef98ba67453210,
4936 0x0001200880ff55aa, 0x1111111111111111};
4937
4938 fn = &MacroAssembler::Eor;
4939 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, eor_exp_b);
4940 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, eor_exp_h);
4941 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, eor_exp_s);
4942 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, eor_exp_d);
4943
4944 unsigned orr_exp_b[] = {0x00, 0xcd, 0xa0, 0xff, 0x80, 0xff, 0x55, 0xaa};
4945
4946 unsigned orr_exp_h[] = {0x7fff, 0x0001, 0xa020, 0xffff,
4947 0xd555, 0xffff, 0x5555, 0xaaaa};
4948
4949 unsigned orr_exp_s[] = {0x7fffabcd, 0xa020ffff, 0xd555ffff, 0x5555aaaa};
4950
4951 uint64_t orr_exp_d[] = {0xfffffefeffddfedc, 0xcdefddffefefffff,
4952 0x0001200880ff55aa, 0x1133557799bbddff};
4953
4954 fn = &MacroAssembler::Orr;
4955 IntBinArithHelper(config, fn, kBRegSize, pg_b, zn_b, zm_b, orr_exp_b);
4956 IntBinArithHelper(config, fn, kHRegSize, pg_h, zn_h, zm_h, orr_exp_h);
4957 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, orr_exp_s);
4958 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, orr_exp_d);
4959 // clang-format on
4960}
4961
4962TEST_SVE(sve_binary_arithmetic_predicated_sdiv) {
4963 // clang-format off
4964 int zn_s[] = {0, 1, -1, 2468,
4965 INT32_MIN, INT32_MAX, INT32_MIN, INT32_MAX,
4966 -11111111, 87654321, 0, 0};
4967
4968 int zm_s[] = {1, -1, 1, 1234,
4969 -1, INT32_MIN, 1, -1,
4970 22222222, 80000000, -1, 0};
4971
4972 int64_t zn_d[] = {0, 1, -1, 2468,
4973 INT64_MIN, INT64_MAX, INT64_MIN, INT64_MAX,
4974 -11111111, 87654321, 0, 0};
4975
4976 int64_t zm_d[] = {1, -1, 1, 1234,
4977 -1, INT64_MIN, 1, -1,
4978 22222222, 80000000, -1, 0};
4979
4980 int pg_s[] = {1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0};
4981 int pg_d[] = {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1};
4982
4983 int exp_s[] = {0, 1, -1, 2,
4984 INT32_MIN, 0, INT32_MIN, -INT32_MAX,
4985 0, 1, 0, 0};
4986
4987 int64_t exp_d[] = {0, -1, -1, 2,
4988 INT64_MIN, INT64_MAX, INT64_MIN, -INT64_MAX,
4989 0, 1, 0, 0};
4990
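  // SVE integer division does not trap: an active lane dividing by zero
  // produces zero, and the overflowing INT_MIN / -1 case wraps back to
  // INT_MIN, as the expected values above encode. A scalar reference sketch:
  //
  //   int32_t SdivRef(int32_t a, int32_t b) {
  //     if (b == 0) return 0;                                 // No trap.
  //     if ((a == INT32_MIN) && (b == -1)) return INT32_MIN;  // Wraps.
  //     return a / b;
  //   }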
TatWai Chong7a0d3672019-10-23 17:35:18 -07004991 ArithPredicatedFn fn = &MacroAssembler::Sdiv;
TatWai Chong13634762019-07-16 16:20:45 -07004992 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, exp_s);
4993 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, exp_d);
4994 // clang-format on
4995}
4996
4997TEST_SVE(sve_binary_arithmetic_predicated_udiv) {
4998 // clang-format off
4999 unsigned zn_s[] = {0x00000000, 0x00000001, 0xffffffff, 0x80000000,
5000 0xffffffff, 0x80000000, 0xffffffff, 0x0000f000};
5001
5002 unsigned zm_s[] = {0x00000001, 0xffffffff, 0x80000000, 0x00000002,
5003 0x00000000, 0x00000001, 0x00008000, 0xf0000000};
5004
5005 uint64_t zn_d[] = {0x0000000000000000, 0x0000000000000001,
5006 0xffffffffffffffff, 0x8000000000000000,
5007 0xffffffffffffffff, 0x8000000000000000,
5008 0xffffffffffffffff, 0xf0000000f0000000};
5009
5010 uint64_t zm_d[] = {0x0000000000000001, 0xffffffff00000000,
5011 0x8000000000000000, 0x0000000000000002,
5012 0x8888888888888888, 0x0000000000000001,
5013 0x0000000080000000, 0x00000000f0000000};
5014
5015 int pg_s[] = {1, 1, 0, 1, 1, 0, 1, 1};
5016 int pg_d[] = {1, 0, 1, 1, 1, 1, 0, 1};
5017
5018 unsigned exp_s[] = {0x00000000, 0x00000000, 0xffffffff, 0x40000000,
5019 0x00000000, 0x80000000, 0x0001ffff, 0x00000000};
5020
5021 uint64_t exp_d[] = {0x0000000000000000, 0x0000000000000001,
5022 0x0000000000000001, 0x4000000000000000,
5023 0x0000000000000001, 0x8000000000000000,
5024 0xffffffffffffffff, 0x0000000100000001};
5025
TatWai Chong7a0d3672019-10-23 17:35:18 -07005026 ArithPredicatedFn fn = &MacroAssembler::Udiv;
TatWai Chong13634762019-07-16 16:20:45 -07005027 IntBinArithHelper(config, fn, kSRegSize, pg_s, zn_s, zm_s, exp_s);
5028 IntBinArithHelper(config, fn, kDRegSize, pg_d, zn_d, zm_d, exp_d);
5029 // clang-format on
5030}
5031
TatWai Chong7a0d3672019-10-23 17:35:18 -07005032typedef void (MacroAssembler::*ArithFn)(const ZRegister& zd,
5033 const ZRegister& zn,
5034 const ZRegister& zm);
TatWai Chong845246b2019-08-08 00:01:58 -07005035
5036template <typename T>
5037static void IntArithHelper(Test* config,
TatWai Chong7a0d3672019-10-23 17:35:18 -07005038 ArithFn macro,
TatWai Chong845246b2019-08-08 00:01:58 -07005039 unsigned lane_size_in_bits,
5040 const T& zn_inputs,
5041 const T& zm_inputs,
5042 const T& zd_expected) {
5043 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5044 START();
5045
5046 ZRegister zn = z31.WithLaneSize(lane_size_in_bits);
5047 ZRegister zm = z27.WithLaneSize(lane_size_in_bits);
5048 InsrHelper(&masm, zn, zn_inputs);
5049 InsrHelper(&masm, zm, zm_inputs);
5050
5051 ZRegister zd = z0.WithLaneSize(lane_size_in_bits);
5052 (masm.*macro)(zd, zn, zm);
5053
5054 END();
5055
5056 if (CAN_RUN()) {
5057 RUN();
5058 ASSERT_EQUAL_SVE(zd_expected, zd);
5059 }
5060}
5061
5062TEST_SVE(sve_arithmetic_unpredicated_add_sqadd_uqadd) {
5063 // clang-format off
TatWai Chong6995bfd2019-09-26 10:48:05 +01005064 unsigned in_b[] = {0x81, 0x7f, 0x10, 0xaa, 0x55, 0xff, 0xf0};
5065 unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa, 0x5555, 0xffff, 0xf0f0};
5066 unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0x10001010, 0xaaaaaaaa, 0xf000f0f0};
5067 uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f,
TatWai Chong845246b2019-08-08 00:01:58 -07005068 0x1000000010001010, 0xf0000000f000f0f0};
5069
TatWai Chong7a0d3672019-10-23 17:35:18 -07005070 ArithFn fn = &MacroAssembler::Add;
TatWai Chong845246b2019-08-08 00:01:58 -07005071
5072 unsigned add_exp_b[] = {0x02, 0xfe, 0x20, 0x54, 0xaa, 0xfe, 0xe0};
5073 unsigned add_exp_h[] = {0x0302, 0xfefe, 0x2020, 0x5554, 0xaaaa, 0xfffe, 0xe1e0};
5074 unsigned add_exp_s[] = {0x00030302, 0xfffefefe, 0x20002020, 0x55555554, 0xe001e1e0};
5075 uint64_t add_exp_d[] = {0x0000000300030302, 0xfffffffefffefefe,
5076 0x2000000020002020, 0xe0000001e001e1e0};
5077
TatWai Chong6995bfd2019-09-26 10:48:05 +01005078 IntArithHelper(config, fn, kBRegSize, in_b, in_b, add_exp_b);
5079 IntArithHelper(config, fn, kHRegSize, in_h, in_h, add_exp_h);
5080 IntArithHelper(config, fn, kSRegSize, in_s, in_s, add_exp_s);
5081 IntArithHelper(config, fn, kDRegSize, in_d, in_d, add_exp_d);
TatWai Chong845246b2019-08-08 00:01:58 -07005082
5083 fn = &MacroAssembler::Sqadd;
5084
5085 unsigned sqadd_exp_b[] = {0x80, 0x7f, 0x20, 0x80, 0x7f, 0xfe, 0xe0};
5086 unsigned sqadd_exp_h[] = {0x8000, 0x7fff, 0x2020, 0x8000, 0x7fff, 0xfffe, 0xe1e0};
5087 unsigned sqadd_exp_s[] = {0x80000000, 0x7fffffff, 0x20002020, 0x80000000, 0xe001e1e0};
5088 uint64_t sqadd_exp_d[] = {0x8000000000000000, 0x7fffffffffffffff,
5089 0x2000000020002020, 0xe0000001e001e1e0};
5090
TatWai Chong6995bfd2019-09-26 10:48:05 +01005091 IntArithHelper(config, fn, kBRegSize, in_b, in_b, sqadd_exp_b);
5092 IntArithHelper(config, fn, kHRegSize, in_h, in_h, sqadd_exp_h);
5093 IntArithHelper(config, fn, kSRegSize, in_s, in_s, sqadd_exp_s);
5094 IntArithHelper(config, fn, kDRegSize, in_d, in_d, sqadd_exp_d);
TatWai Chong845246b2019-08-08 00:01:58 -07005095
5096 fn = &MacroAssembler::Uqadd;
5097
5098 unsigned uqadd_exp_b[] = {0xff, 0xfe, 0x20, 0xff, 0xaa, 0xff, 0xff};
5099 unsigned uqadd_exp_h[] = {0xffff, 0xfefe, 0x2020, 0xffff, 0xaaaa, 0xffff, 0xffff};
5100 unsigned uqadd_exp_s[] = {0xffffffff, 0xfffefefe, 0x20002020, 0xffffffff, 0xffffffff};
5101 uint64_t uqadd_exp_d[] = {0xffffffffffffffff, 0xfffffffefffefefe,
5102 0x2000000020002020, 0xffffffffffffffff};
5103
TatWai Chong6995bfd2019-09-26 10:48:05 +01005104 IntArithHelper(config, fn, kBRegSize, in_b, in_b, uqadd_exp_b);
5105 IntArithHelper(config, fn, kHRegSize, in_h, in_h, uqadd_exp_h);
5106 IntArithHelper(config, fn, kSRegSize, in_s, in_s, uqadd_exp_s);
5107 IntArithHelper(config, fn, kDRegSize, in_d, in_d, uqadd_exp_d);
TatWai Chong845246b2019-08-08 00:01:58 -07005108 // clang-format on
5109}
5110
5111TEST_SVE(sve_arithmetic_unpredicated_sub_sqsub_uqsub) {
5112 // clang-format off
5113
5114 unsigned ins1_b[] = {0x81, 0x7f, 0x7e, 0xaa};
5115 unsigned ins2_b[] = {0x10, 0xf0, 0xf0, 0x55};
5116
5117 unsigned ins1_h[] = {0x8181, 0x7f7f, 0x7e7e, 0xaaaa};
5118 unsigned ins2_h[] = {0x1010, 0xf0f0, 0xf0f0, 0x5555};
5119
5120 unsigned ins1_s[] = {0x80018181, 0x7fff7f7f, 0x7eee7e7e, 0xaaaaaaaa};
5121 unsigned ins2_s[] = {0x10001010, 0xf000f0f0, 0xf000f0f0, 0x55555555};
5122
5123 uint64_t ins1_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f,
5124 0x7eeeeeee7eee7e7e, 0xaaaaaaaaaaaaaaaa};
5125 uint64_t ins2_d[] = {0x1000000010001010, 0xf0000000f000f0f0,
5126 0xf0000000f000f0f0, 0x5555555555555555};
5127
TatWai Chong7a0d3672019-10-23 17:35:18 -07005128 ArithFn fn = &MacroAssembler::Sub;
TatWai Chong845246b2019-08-08 00:01:58 -07005129
5130 unsigned ins1_sub_ins2_exp_b[] = {0x71, 0x8f, 0x8e, 0x55};
5131 unsigned ins1_sub_ins2_exp_h[] = {0x7171, 0x8e8f, 0x8d8e, 0x5555};
5132 unsigned ins1_sub_ins2_exp_s[] = {0x70017171, 0x8ffe8e8f, 0x8eed8d8e, 0x55555555};
5133 uint64_t ins1_sub_ins2_exp_d[] = {0x7000000170017171, 0x8ffffffe8ffe8e8f,
5134 0x8eeeeeed8eed8d8e, 0x5555555555555555};
5135
5136 IntArithHelper(config, fn, kBRegSize, ins1_b, ins2_b, ins1_sub_ins2_exp_b);
5137 IntArithHelper(config, fn, kHRegSize, ins1_h, ins2_h, ins1_sub_ins2_exp_h);
5138 IntArithHelper(config, fn, kSRegSize, ins1_s, ins2_s, ins1_sub_ins2_exp_s);
5139 IntArithHelper(config, fn, kDRegSize, ins1_d, ins2_d, ins1_sub_ins2_exp_d);
5140
5141 unsigned ins2_sub_ins1_exp_b[] = {0x8f, 0x71, 0x72, 0xab};
5142 unsigned ins2_sub_ins1_exp_h[] = {0x8e8f, 0x7171, 0x7272, 0xaaab};
5143 unsigned ins2_sub_ins1_exp_s[] = {0x8ffe8e8f, 0x70017171, 0x71127272, 0xaaaaaaab};
5144 uint64_t ins2_sub_ins1_exp_d[] = {0x8ffffffe8ffe8e8f, 0x7000000170017171,
5145 0x7111111271127272, 0xaaaaaaaaaaaaaaab};
5146
5147 IntArithHelper(config, fn, kBRegSize, ins2_b, ins1_b, ins2_sub_ins1_exp_b);
5148 IntArithHelper(config, fn, kHRegSize, ins2_h, ins1_h, ins2_sub_ins1_exp_h);
5149 IntArithHelper(config, fn, kSRegSize, ins2_s, ins1_s, ins2_sub_ins1_exp_s);
5150 IntArithHelper(config, fn, kDRegSize, ins2_d, ins1_d, ins2_sub_ins1_exp_d);
5151
5152 fn = &MacroAssembler::Sqsub;
5153
5154 unsigned ins1_sqsub_ins2_exp_b[] = {0x80, 0x7f, 0x7f, 0x80};
5155 unsigned ins1_sqsub_ins2_exp_h[] = {0x8000, 0x7fff, 0x7fff, 0x8000};
5156 unsigned ins1_sqsub_ins2_exp_s[] = {0x80000000, 0x7fffffff, 0x7fffffff, 0x80000000};
5157 uint64_t ins1_sqsub_ins2_exp_d[] = {0x8000000000000000, 0x7fffffffffffffff,
5158 0x7fffffffffffffff, 0x8000000000000000};
5159
5160 IntArithHelper(config, fn, kBRegSize, ins1_b, ins2_b, ins1_sqsub_ins2_exp_b);
5161 IntArithHelper(config, fn, kHRegSize, ins1_h, ins2_h, ins1_sqsub_ins2_exp_h);
5162 IntArithHelper(config, fn, kSRegSize, ins1_s, ins2_s, ins1_sqsub_ins2_exp_s);
5163 IntArithHelper(config, fn, kDRegSize, ins1_d, ins2_d, ins1_sqsub_ins2_exp_d);
5164
5165 unsigned ins2_sqsub_ins1_exp_b[] = {0x7f, 0x80, 0x80, 0x7f};
5166 unsigned ins2_sqsub_ins1_exp_h[] = {0x7fff, 0x8000, 0x8000, 0x7fff};
5167 unsigned ins2_sqsub_ins1_exp_s[] = {0x7fffffff, 0x80000000, 0x80000000, 0x7fffffff};
5168 uint64_t ins2_sqsub_ins1_exp_d[] = {0x7fffffffffffffff, 0x8000000000000000,
5169 0x8000000000000000, 0x7fffffffffffffff};
5170
5171 IntArithHelper(config, fn, kBRegSize, ins2_b, ins1_b, ins2_sqsub_ins1_exp_b);
5172 IntArithHelper(config, fn, kHRegSize, ins2_h, ins1_h, ins2_sqsub_ins1_exp_h);
5173 IntArithHelper(config, fn, kSRegSize, ins2_s, ins1_s, ins2_sqsub_ins1_exp_s);
5174 IntArithHelper(config, fn, kDRegSize, ins2_d, ins1_d, ins2_sqsub_ins1_exp_d);
5175
5176 fn = &MacroAssembler::Uqsub;
5177
5178 unsigned ins1_uqsub_ins2_exp_b[] = {0x71, 0x00, 0x00, 0x55};
5179 unsigned ins1_uqsub_ins2_exp_h[] = {0x7171, 0x0000, 0x0000, 0x5555};
5180 unsigned ins1_uqsub_ins2_exp_s[] = {0x70017171, 0x00000000, 0x00000000, 0x55555555};
5181 uint64_t ins1_uqsub_ins2_exp_d[] = {0x7000000170017171, 0x0000000000000000,
5182 0x0000000000000000, 0x5555555555555555};
5183
5184 IntArithHelper(config, fn, kBRegSize, ins1_b, ins2_b, ins1_uqsub_ins2_exp_b);
5185 IntArithHelper(config, fn, kHRegSize, ins1_h, ins2_h, ins1_uqsub_ins2_exp_h);
5186 IntArithHelper(config, fn, kSRegSize, ins1_s, ins2_s, ins1_uqsub_ins2_exp_s);
5187 IntArithHelper(config, fn, kDRegSize, ins1_d, ins2_d, ins1_uqsub_ins2_exp_d);
5188
5189 unsigned ins2_uqsub_ins1_exp_b[] = {0x00, 0x71, 0x72, 0x00};
5190 unsigned ins2_uqsub_ins1_exp_h[] = {0x0000, 0x7171, 0x7272, 0x0000};
5191 unsigned ins2_uqsub_ins1_exp_s[] = {0x00000000, 0x70017171, 0x71127272, 0x00000000};
5192 uint64_t ins2_uqsub_ins1_exp_d[] = {0x0000000000000000, 0x7000000170017171,
5193 0x7111111271127272, 0x0000000000000000};
5194
5195 IntArithHelper(config, fn, kBRegSize, ins2_b, ins1_b, ins2_uqsub_ins1_exp_b);
5196 IntArithHelper(config, fn, kHRegSize, ins2_h, ins1_h, ins2_uqsub_ins1_exp_h);
5197 IntArithHelper(config, fn, kSRegSize, ins2_s, ins1_s, ins2_uqsub_ins1_exp_s);
5198 IntArithHelper(config, fn, kDRegSize, ins2_d, ins1_d, ins2_uqsub_ins1_exp_d);
5199 // clang-format on
5200}
5201
Jacob Bramley9e5da2a2019-08-06 18:52:07 +01005202TEST_SVE(sve_rdvl) {
5203 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5204 START();
5205
5206 // Encodable multipliers.
5207 __ Rdvl(x0, 0);
5208 __ Rdvl(x1, 1);
5209 __ Rdvl(x2, 2);
5210 __ Rdvl(x3, 31);
5211 __ Rdvl(x4, -1);
5212 __ Rdvl(x5, -2);
5213 __ Rdvl(x6, -32);
5214
5215 // For unencodable multipliers, the MacroAssembler uses a sequence of
5216 // instructions.
5217 __ Rdvl(x10, 32);
5218 __ Rdvl(x11, -33);
5219 __ Rdvl(x12, 42);
5220 __ Rdvl(x13, -42);
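  // One possible expansion for `Rdvl(x12, 42)` (an illustrative assumption; the
  // MacroAssembler is free to pick any equivalent sequence using a scratch
  // register):
  //
  //   rdvl <scratch>, #1
  //   mov x12, #42
  //   mul x12, x12, <scratch>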
5221
5222 // The maximum value of VL is 256 (bytes), so the multiplier is limited to the
5223 // range [INT64_MIN/256, INT64_MAX/256], to ensure that no signed overflow
5224 // occurs in the macro.
5225 __ Rdvl(x14, 0x007fffffffffffff);
5226 __ Rdvl(x15, -0x0080000000000000);
5227
5228 END();
5229
5230 if (CAN_RUN()) {
5231 RUN();
5232
5233 uint64_t vl = config->sve_vl_in_bytes();
5234
5235 ASSERT_EQUAL_64(vl * 0, x0);
5236 ASSERT_EQUAL_64(vl * 1, x1);
5237 ASSERT_EQUAL_64(vl * 2, x2);
5238 ASSERT_EQUAL_64(vl * 31, x3);
5239 ASSERT_EQUAL_64(vl * -1, x4);
5240 ASSERT_EQUAL_64(vl * -2, x5);
5241 ASSERT_EQUAL_64(vl * -32, x6);
5242
5243 ASSERT_EQUAL_64(vl * 32, x10);
5244 ASSERT_EQUAL_64(vl * -33, x11);
5245 ASSERT_EQUAL_64(vl * 42, x12);
5246 ASSERT_EQUAL_64(vl * -42, x13);
5247
5248 ASSERT_EQUAL_64(vl * 0x007fffffffffffff, x14);
5249 ASSERT_EQUAL_64(vl * 0xff80000000000000, x15);
5250 }
5251}
5252
5253TEST_SVE(sve_rdpl) {
5254 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5255 START();
5256
5257 // There is no `rdpl` instruction, so the MacroAssembler maps `Rdpl` onto
5258 // Addpl(xd, xzr, ...).
5259
5260 // Encodable multipliers (as `addvl`).
5261 __ Rdpl(x0, 0);
5262 __ Rdpl(x1, 8);
5263 __ Rdpl(x2, 248);
5264 __ Rdpl(x3, -8);
5265 __ Rdpl(x4, -256);
5266
5267 // Encodable multipliers (as `movz` + `addpl`).
5268 __ Rdpl(x7, 31);
Jacob Bramley889984c2019-10-28 17:28:48 +00005269 __ Rdpl(x8, -31);
Jacob Bramley9e5da2a2019-08-06 18:52:07 +01005270
5271 // For unencodable multipliers, the MacroAssembler uses a sequence of
5272 // instructions.
5273 __ Rdpl(x10, 42);
5274 __ Rdpl(x11, -42);
5275
5276 // The maximum value of VL is 256 (bytes), so the multiplier is limited to the
5277 // range [INT64_MIN/256, INT64_MAX/256], to ensure that no signed overflow
5278 // occurs in the macro.
5279 __ Rdpl(x12, 0x007fffffffffffff);
5280 __ Rdpl(x13, -0x0080000000000000);
5281
5282 END();
5283
5284 if (CAN_RUN()) {
5285 RUN();
5286
5287 uint64_t vl = config->sve_vl_in_bytes();
5288 VIXL_ASSERT((vl % kZRegBitsPerPRegBit) == 0);
5289 uint64_t pl = vl / kZRegBitsPerPRegBit;
5290
5291 ASSERT_EQUAL_64(pl * 0, x0);
5292 ASSERT_EQUAL_64(pl * 8, x1);
5293 ASSERT_EQUAL_64(pl * 248, x2);
5294 ASSERT_EQUAL_64(pl * -8, x3);
5295 ASSERT_EQUAL_64(pl * -256, x4);
5296
5297 ASSERT_EQUAL_64(pl * 31, x7);
Jacob Bramley889984c2019-10-28 17:28:48 +00005298 ASSERT_EQUAL_64(pl * -31, x8);
Jacob Bramley9e5da2a2019-08-06 18:52:07 +01005299
5300 ASSERT_EQUAL_64(pl * 42, x10);
5301 ASSERT_EQUAL_64(pl * -42, x11);
5302
5303 ASSERT_EQUAL_64(pl * 0x007fffffffffffff, x12);
5304 ASSERT_EQUAL_64(pl * 0xff80000000000000, x13);
5305 }
5306}
5307
5308TEST_SVE(sve_addvl) {
5309 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5310 START();
5311
5312 uint64_t base = 0x1234567800000000;
5313 __ Mov(x30, base);
5314
5315 // Encodable multipliers.
5316 __ Addvl(x0, x30, 0);
5317 __ Addvl(x1, x30, 1);
5318 __ Addvl(x2, x30, 31);
5319 __ Addvl(x3, x30, -1);
5320 __ Addvl(x4, x30, -32);
5321
5322 // For unencodable multipliers, the MacroAssembler uses `Rdvl` and `Add`.
5323 __ Addvl(x5, x30, 32);
5324 __ Addvl(x6, x30, -33);
5325
5326 // Test the limits of the multiplier supported by the `Rdvl` macro.
5327 __ Addvl(x7, x30, 0x007fffffffffffff);
5328 __ Addvl(x8, x30, -0x0080000000000000);
5329
5330 // Check that xzr behaves correctly.
5331 __ Addvl(x9, xzr, 8);
5332 __ Addvl(x10, xzr, 42);
5333
5334 // Check that sp behaves correctly with encodable and unencodable multipliers.
5335 __ Addvl(sp, sp, -5);
5336 __ Addvl(sp, sp, -37);
5337 __ Addvl(x11, sp, -2);
5338 __ Addvl(sp, x11, 2);
5339 __ Addvl(x12, sp, -42);
5340
5341 // Restore the value of sp.
5342 __ Addvl(sp, x11, 39);
5343 __ Addvl(sp, sp, 5);
5344
5345 // Adjust x11 and x12 to make the test sp-agnostic.
5346 __ Sub(x11, sp, x11);
5347 __ Sub(x12, sp, x12);
5348
5349 // Check cases where xd.Is(xn). This stresses scratch register allocation.
5350 __ Mov(x20, x30);
5351 __ Mov(x21, x30);
5352 __ Mov(x22, x30);
5353 __ Addvl(x20, x20, 4);
5354 __ Addvl(x21, x21, 42);
5355 __ Addvl(x22, x22, -0x0080000000000000);
5356
5357 END();
5358
5359 if (CAN_RUN()) {
5360 RUN();
5361
5362 uint64_t vl = config->sve_vl_in_bytes();
5363
5364 ASSERT_EQUAL_64(base + (vl * 0), x0);
5365 ASSERT_EQUAL_64(base + (vl * 1), x1);
5366 ASSERT_EQUAL_64(base + (vl * 31), x2);
5367 ASSERT_EQUAL_64(base + (vl * -1), x3);
5368 ASSERT_EQUAL_64(base + (vl * -32), x4);
5369
5370 ASSERT_EQUAL_64(base + (vl * 32), x5);
5371 ASSERT_EQUAL_64(base + (vl * -33), x6);
5372
5373 ASSERT_EQUAL_64(base + (vl * 0x007fffffffffffff), x7);
5374 ASSERT_EQUAL_64(base + (vl * 0xff80000000000000), x8);
5375
5376 ASSERT_EQUAL_64(vl * 8, x9);
5377 ASSERT_EQUAL_64(vl * 42, x10);
5378
5379 ASSERT_EQUAL_64(vl * 44, x11);
5380 ASSERT_EQUAL_64(vl * 84, x12);
5381
5382 ASSERT_EQUAL_64(base + (vl * 4), x20);
5383 ASSERT_EQUAL_64(base + (vl * 42), x21);
5384 ASSERT_EQUAL_64(base + (vl * 0xff80000000000000), x22);
5385
5386 ASSERT_EQUAL_64(base, x30);
5387 }
5388}
5389
5390TEST_SVE(sve_addpl) {
5391 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5392 START();
5393
5394 uint64_t base = 0x1234567800000000;
5395 __ Mov(x30, base);
5396
5397 // Encodable multipliers.
5398 __ Addpl(x0, x30, 0);
5399 __ Addpl(x1, x30, 1);
5400 __ Addpl(x2, x30, 31);
5401 __ Addpl(x3, x30, -1);
5402 __ Addpl(x4, x30, -32);
5403
5404 // For unencodable multipliers, the MacroAssembler uses `Addvl` if it can, or
5405 // it falls back to `Rdvl` and `Add`.
5406 __ Addpl(x5, x30, 32);
5407 __ Addpl(x6, x30, -33);
5408
5409 // Test the limits of the multiplier supported by the `Rdvl` macro.
5410 __ Addpl(x7, x30, 0x007fffffffffffff);
5411 __ Addpl(x8, x30, -0x0080000000000000);
5412
5413 // Check that xzr behaves correctly.
5414 __ Addpl(x9, xzr, 8);
5415 __ Addpl(x10, xzr, 42);
5416
5417 // Check that sp behaves correctly with encodable and unencodable multipliers.
5418 __ Addpl(sp, sp, -5);
5419 __ Addpl(sp, sp, -37);
5420 __ Addpl(x11, sp, -2);
5421 __ Addpl(sp, x11, 2);
5422 __ Addpl(x12, sp, -42);
5423
5424 // Restore the value of sp.
5425 __ Addpl(sp, x11, 39);
5426 __ Addpl(sp, sp, 5);
5427
5428 // Adjust x11 and x12 to make the test sp-agnostic.
5429 __ Sub(x11, sp, x11);
5430 __ Sub(x12, sp, x12);
5431
5432 // Check cases where xd.Is(xn). This stresses scratch register allocation.
5433 __ Mov(x20, x30);
5434 __ Mov(x21, x30);
5435 __ Mov(x22, x30);
5436 __ Addpl(x20, x20, 4);
5437 __ Addpl(x21, x21, 42);
5438 __ Addpl(x22, x22, -0x0080000000000000);
5439
5440 END();
5441
5442 if (CAN_RUN()) {
5443 RUN();
5444
5445 uint64_t vl = config->sve_vl_in_bytes();
5446 VIXL_ASSERT((vl % kZRegBitsPerPRegBit) == 0);
5447 uint64_t pl = vl / kZRegBitsPerPRegBit;
5448
5449 ASSERT_EQUAL_64(base + (pl * 0), x0);
5450 ASSERT_EQUAL_64(base + (pl * 1), x1);
5451 ASSERT_EQUAL_64(base + (pl * 31), x2);
5452 ASSERT_EQUAL_64(base + (pl * -1), x3);
5453 ASSERT_EQUAL_64(base + (pl * -32), x4);
5454
5455 ASSERT_EQUAL_64(base + (pl * 32), x5);
5456 ASSERT_EQUAL_64(base + (pl * -33), x6);
5457
5458 ASSERT_EQUAL_64(base + (pl * 0x007fffffffffffff), x7);
5459 ASSERT_EQUAL_64(base + (pl * 0xff80000000000000), x8);
5460
5461 ASSERT_EQUAL_64(pl * 8, x9);
5462 ASSERT_EQUAL_64(pl * 42, x10);
5463
5464 ASSERT_EQUAL_64(pl * 44, x11);
5465 ASSERT_EQUAL_64(pl * 84, x12);
5466
5467 ASSERT_EQUAL_64(base + (pl * 4), x20);
5468 ASSERT_EQUAL_64(base + (pl * 42), x21);
5469 ASSERT_EQUAL_64(base + (pl * 0xff80000000000000), x22);
5470
5471 ASSERT_EQUAL_64(base, x30);
5472 }
5473}
5474
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005475TEST_SVE(sve_calculate_sve_address) {
Martyn Capewell6e8db232022-01-07 16:38:14 +00005476#pragma GCC diagnostic push
5477#pragma GCC diagnostic ignored "-Wshadow"
5478
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005479 // Shadow the `MacroAssembler` type so that the test macros work without
5480 // modification.
5481 typedef CalculateSVEAddressMacroAssembler MacroAssembler;
5482
Jacob Bramley1314c462019-08-08 10:54:16 +01005483 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005484 START(); // NOLINT(clang-diagnostic-local-type-template-args)
Jacob Bramley1314c462019-08-08 10:54:16 +01005485
5486 uint64_t base = 0x1234567800000000;
5487 __ Mov(x28, base);
5488 __ Mov(x29, 48);
5489 __ Mov(x30, -48);
5490
5491 // Simple scalar (or equivalent) cases.
5492
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005493 __ CalculateSVEAddress(x0, SVEMemOperand(x28));
5494 __ CalculateSVEAddress(x1, SVEMemOperand(x28, 0));
5495 __ CalculateSVEAddress(x2, SVEMemOperand(x28, 0, SVE_MUL_VL));
5496 __ CalculateSVEAddress(x3, SVEMemOperand(x28, 0, SVE_MUL_VL), 3);
5497 __ CalculateSVEAddress(x4, SVEMemOperand(x28, xzr));
5498 __ CalculateSVEAddress(x5, SVEMemOperand(x28, xzr, LSL, 42));
Jacob Bramley1314c462019-08-08 10:54:16 +01005499
5500 // scalar-plus-immediate
5501
5502 // Unscaled immediates, handled with `Add`.
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005503 __ CalculateSVEAddress(x6, SVEMemOperand(x28, 42));
5504 __ CalculateSVEAddress(x7, SVEMemOperand(x28, -42));
Jacob Bramley1314c462019-08-08 10:54:16 +01005505 // Scaled immediates, handled with `Addvl` or `Addpl`.
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005506 __ CalculateSVEAddress(x8, SVEMemOperand(x28, 31, SVE_MUL_VL), 0);
5507 __ CalculateSVEAddress(x9, SVEMemOperand(x28, -32, SVE_MUL_VL), 0);
Jacob Bramley1314c462019-08-08 10:54:16 +01005508 // Out of `addvl` or `addpl` range.
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005509 __ CalculateSVEAddress(x10, SVEMemOperand(x28, 42, SVE_MUL_VL), 0);
5510 __ CalculateSVEAddress(x11, SVEMemOperand(x28, -42, SVE_MUL_VL), 0);
5511 // As above, for VL-based accesses smaller than a Z register.
5512 VIXL_STATIC_ASSERT(kZRegBitsPerPRegBitLog2 == 3);
5513 __ CalculateSVEAddress(x12, SVEMemOperand(x28, -32 * 8, SVE_MUL_VL), 3);
5514 __ CalculateSVEAddress(x13, SVEMemOperand(x28, -42 * 8, SVE_MUL_VL), 3);
5515 __ CalculateSVEAddress(x14, SVEMemOperand(x28, -32 * 4, SVE_MUL_VL), 2);
5516 __ CalculateSVEAddress(x15, SVEMemOperand(x28, -42 * 4, SVE_MUL_VL), 2);
5517 __ CalculateSVEAddress(x18, SVEMemOperand(x28, -32 * 2, SVE_MUL_VL), 1);
5518 __ CalculateSVEAddress(x19, SVEMemOperand(x28, -42 * 2, SVE_MUL_VL), 1);
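  // The trailing argument scales the VL multiplier down by that power of two,
  // so, for example, the x13 case above resolves to
  // base + ((-42 * 8) * (VL / 8)) = base - (42 * VL) bytes, as the checks below
  // confirm.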
Jacob Bramley1314c462019-08-08 10:54:16 +01005519
5520 // scalar-plus-scalar
5521
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005522 __ CalculateSVEAddress(x20, SVEMemOperand(x28, x29));
5523 __ CalculateSVEAddress(x21, SVEMemOperand(x28, x30));
5524 __ CalculateSVEAddress(x22, SVEMemOperand(x28, x29, LSL, 8));
5525 __ CalculateSVEAddress(x23, SVEMemOperand(x28, x30, LSL, 8));
Jacob Bramley1314c462019-08-08 10:54:16 +01005526
5527 // In-place updates, to stress scratch register allocation.
5528
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005529 __ Mov(x24, 0xabcd000000000000);
5530 __ Mov(x25, 0xabcd101100000000);
5531 __ Mov(x26, 0xabcd202200000000);
5532 __ Mov(x27, 0xabcd303300000000);
5533 __ Mov(x28, 0xabcd404400000000);
5534 __ Mov(x29, 0xabcd505500000000);
Jacob Bramley1314c462019-08-08 10:54:16 +01005535
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005536 __ CalculateSVEAddress(x24, SVEMemOperand(x24));
5537 __ CalculateSVEAddress(x25, SVEMemOperand(x25, 0x42));
5538 __ CalculateSVEAddress(x26, SVEMemOperand(x26, 3, SVE_MUL_VL), 0);
5539 __ CalculateSVEAddress(x27, SVEMemOperand(x27, 0x42, SVE_MUL_VL), 3);
5540 __ CalculateSVEAddress(x28, SVEMemOperand(x28, x30));
5541 __ CalculateSVEAddress(x29, SVEMemOperand(x29, x30, LSL, 4));
Jacob Bramley1314c462019-08-08 10:54:16 +01005542
5543 END();
5544
5545 if (CAN_RUN()) {
5546 RUN();
5547
5548 uint64_t vl = config->sve_vl_in_bytes();
5549 VIXL_ASSERT((vl % kZRegBitsPerPRegBit) == 0);
5550 uint64_t pl = vl / kZRegBitsPerPRegBit;
5551
5552 // Simple scalar (or equivalent) cases.
5553 ASSERT_EQUAL_64(base, x0);
5554 ASSERT_EQUAL_64(base, x1);
5555 ASSERT_EQUAL_64(base, x2);
5556 ASSERT_EQUAL_64(base, x3);
5557 ASSERT_EQUAL_64(base, x4);
5558 ASSERT_EQUAL_64(base, x5);
5559
5560 // scalar-plus-immediate
5561 ASSERT_EQUAL_64(base + 42, x6);
5562 ASSERT_EQUAL_64(base - 42, x7);
5563 ASSERT_EQUAL_64(base + (31 * vl), x8);
5564 ASSERT_EQUAL_64(base - (32 * vl), x9);
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005565 ASSERT_EQUAL_64(base + (42 * vl), x10);
5566 ASSERT_EQUAL_64(base - (42 * vl), x11);
5567 ASSERT_EQUAL_64(base - (32 * vl), x12);
Jacob Bramley1314c462019-08-08 10:54:16 +01005568 ASSERT_EQUAL_64(base - (42 * vl), x13);
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005569 ASSERT_EQUAL_64(base - (32 * vl), x14);
5570 ASSERT_EQUAL_64(base - (42 * vl), x15);
5571 ASSERT_EQUAL_64(base - (32 * vl), x18);
5572 ASSERT_EQUAL_64(base - (42 * vl), x19);
Jacob Bramley1314c462019-08-08 10:54:16 +01005573
5574 // scalar-plus-scalar
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005575 ASSERT_EQUAL_64(base + 48, x20);
5576 ASSERT_EQUAL_64(base - 48, x21);
5577 ASSERT_EQUAL_64(base + (48 << 8), x22);
5578 ASSERT_EQUAL_64(base - (48 << 8), x23);
Jacob Bramley1314c462019-08-08 10:54:16 +01005579
5580 // In-place updates.
Jacob Bramley6ebbba62019-10-09 15:02:10 +01005581 ASSERT_EQUAL_64(0xabcd000000000000, x24);
5582 ASSERT_EQUAL_64(0xabcd101100000000 + 0x42, x25);
5583 ASSERT_EQUAL_64(0xabcd202200000000 + (3 * vl), x26);
5584 ASSERT_EQUAL_64(0xabcd303300000000 + (0x42 * pl), x27);
5585 ASSERT_EQUAL_64(0xabcd404400000000 - 48, x28);
5586 ASSERT_EQUAL_64(0xabcd505500000000 - (48 << 4), x29);
Jacob Bramley1314c462019-08-08 10:54:16 +01005587 }
Martyn Capewell6e8db232022-01-07 16:38:14 +00005588#pragma GCC diagnostic pop
Jacob Bramley1314c462019-08-08 10:54:16 +01005589}
5590
TatWai Chong4f28df72019-08-14 17:50:30 -07005591TEST_SVE(sve_permute_vector_unpredicated) {
5592 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
5593 START();
5594
Jacob Bramleye4983d42019-10-08 10:56:15 +01005595 // Initialise registers with known values first.
5596 __ Dup(z1.VnB(), 0x11);
5597 __ Dup(z2.VnB(), 0x22);
5598 __ Dup(z3.VnB(), 0x33);
5599 __ Dup(z4.VnB(), 0x44);
5600
TatWai Chong4f28df72019-08-14 17:50:30 -07005601 __ Mov(x0, 0x0123456789abcdef);
5602 __ Fmov(d0, RawbitsToDouble(0x7ffaaaaa22223456));
5603 __ Insr(z1.VnS(), w0);
5604 __ Insr(z2.VnD(), x0);
5605 __ Insr(z3.VnH(), h0);
5606 __ Insr(z4.VnD(), d0);
5607
5608 uint64_t inputs[] = {0xfedcba9876543210,
5609 0x0123456789abcdef,
5610 0x8f8e8d8c8b8a8988,
5611 0x8786858483828180};
5612
5613 // Initialize a distinguishable value throughout the register first.
5614 __ Dup(z9.VnB(), 0xff);
5615 InsrHelper(&masm, z9.VnD(), inputs);
5616
5617 __ Rev(z5.VnB(), z9.VnB());
5618 __ Rev(z6.VnH(), z9.VnH());
5619 __ Rev(z7.VnS(), z9.VnS());
5620 __ Rev(z8.VnD(), z9.VnD());
5621
5622 int index[7] = {22, 7, 7, 3, 1, 1, 63};
5623 // Broadcast lanes from within the input array.
5624 __ Dup(z10.VnB(), z9.VnB(), index[0]);
5625 __ Dup(z11.VnH(), z9.VnH(), index[1]);
5626 __ Dup(z12.VnS(), z9.VnS(), index[2]);
5627 __ Dup(z13.VnD(), z9.VnD(), index[3]);
5628 __ Dup(z14.VnQ(), z9.VnQ(), index[4]);
5629 // Test dst == src
5630 __ Mov(z15, z9);
5631 __ Dup(z15.VnS(), z15.VnS(), index[5]);
5632  // Select data from beyond the input array.
5633 __ Dup(z16.VnB(), z9.VnB(), index[6]);
5634
5635 END();
5636
5637 if (CAN_RUN()) {
5638 RUN();
5639
5640 // Insr
Jacob Bramleye4983d42019-10-08 10:56:15 +01005641 uint64_t z1_expected[] = {0x1111111111111111, 0x1111111189abcdef};
5642 uint64_t z2_expected[] = {0x2222222222222222, 0x0123456789abcdef};
5643 uint64_t z3_expected[] = {0x3333333333333333, 0x3333333333333456};
5644 uint64_t z4_expected[] = {0x4444444444444444, 0x7ffaaaaa22223456};
TatWai Chong4f28df72019-08-14 17:50:30 -07005645 ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
5646 ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
5647 ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
5648 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
5649
5650 // Rev
5651 int lane_count = core.GetSVELaneCount(kBRegSize);
5652 for (int i = 0; i < lane_count; i++) {
5653 uint64_t expected =
5654 core.zreg_lane(z5.GetCode(), kBRegSize, lane_count - i - 1);
5655 uint64_t input = core.zreg_lane(z9.GetCode(), kBRegSize, i);
5656 ASSERT_EQUAL_64(expected, input);
5657 }
5658
5659 lane_count = core.GetSVELaneCount(kHRegSize);
5660 for (int i = 0; i < lane_count; i++) {
5661 uint64_t expected =
5662 core.zreg_lane(z6.GetCode(), kHRegSize, lane_count - i - 1);
5663 uint64_t input = core.zreg_lane(z9.GetCode(), kHRegSize, i);
5664 ASSERT_EQUAL_64(expected, input);
5665 }
5666
5667 lane_count = core.GetSVELaneCount(kSRegSize);
5668 for (int i = 0; i < lane_count; i++) {
5669 uint64_t expected =
5670 core.zreg_lane(z7.GetCode(), kSRegSize, lane_count - i - 1);
5671 uint64_t input = core.zreg_lane(z9.GetCode(), kSRegSize, i);
5672 ASSERT_EQUAL_64(expected, input);
5673 }
5674
5675 lane_count = core.GetSVELaneCount(kDRegSize);
5676 for (int i = 0; i < lane_count; i++) {
5677 uint64_t expected =
5678 core.zreg_lane(z8.GetCode(), kDRegSize, lane_count - i - 1);
5679 uint64_t input = core.zreg_lane(z9.GetCode(), kDRegSize, i);
5680 ASSERT_EQUAL_64(expected, input);
5681 }
5682
5683 // Dup
5684 unsigned vl = config->sve_vl_in_bits();
5685 lane_count = core.GetSVELaneCount(kBRegSize);
5686 uint64_t expected_z10 = (vl > (index[0] * kBRegSize)) ? 0x23 : 0;
5687 for (int i = 0; i < lane_count; i++) {
5688 ASSERT_EQUAL_SVE_LANE(expected_z10, z10.VnB(), i);
5689 }
5690
5691 lane_count = core.GetSVELaneCount(kHRegSize);
5692 uint64_t expected_z11 = (vl > (index[1] * kHRegSize)) ? 0x8f8e : 0;
5693 for (int i = 0; i < lane_count; i++) {
5694 ASSERT_EQUAL_SVE_LANE(expected_z11, z11.VnH(), i);
5695 }
5696
5697 lane_count = core.GetSVELaneCount(kSRegSize);
5698 uint64_t expected_z12 = (vl > (index[2] * kSRegSize)) ? 0xfedcba98 : 0;
5699 for (int i = 0; i < lane_count; i++) {
5700 ASSERT_EQUAL_SVE_LANE(expected_z12, z12.VnS(), i);
5701 }
5702
5703 lane_count = core.GetSVELaneCount(kDRegSize);
5704 uint64_t expected_z13 =
5705 (vl > (index[3] * kDRegSize)) ? 0xfedcba9876543210 : 0;
5706 for (int i = 0; i < lane_count; i++) {
5707 ASSERT_EQUAL_SVE_LANE(expected_z13, z13.VnD(), i);
5708 }
5709
5710 lane_count = core.GetSVELaneCount(kDRegSize);
5711 uint64_t expected_z14_lo = 0;
5712 uint64_t expected_z14_hi = 0;
5713 if (vl > (index[4] * kQRegSize)) {
5714 expected_z14_lo = 0x0123456789abcdef;
5715 expected_z14_hi = 0xfedcba9876543210;
5716 }
5717 for (int i = 0; i < lane_count; i += 2) {
5718 ASSERT_EQUAL_SVE_LANE(expected_z14_lo, z14.VnD(), i);
5719 ASSERT_EQUAL_SVE_LANE(expected_z14_hi, z14.VnD(), i + 1);
5720 }
5721
5722 lane_count = core.GetSVELaneCount(kSRegSize);
5723 uint64_t expected_z15 = (vl > (index[5] * kSRegSize)) ? 0x87868584 : 0;
5724 for (int i = 0; i < lane_count; i++) {
5725 ASSERT_EQUAL_SVE_LANE(expected_z15, z15.VnS(), i);
5726 }
5727
5728 lane_count = core.GetSVELaneCount(kBRegSize);
5729 uint64_t expected_z16 = (vl > (index[6] * kBRegSize)) ? 0xff : 0;
5730 for (int i = 0; i < lane_count; i++) {
5731 ASSERT_EQUAL_SVE_LANE(expected_z16, z16.VnB(), i);
5732 }
5733 }
5734}
5735
Martyn Capewell2e954292020-01-14 14:56:42 +00005736TEST_SVE(sve_permute_vector_unpredicated_unpack_vector_elements) {
TatWai Chong4f28df72019-08-14 17:50:30 -07005737 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5738 START();
5739
5740 uint64_t z9_inputs[] = {0xfedcba9876543210,
5741 0x0123456789abcdef,
5742 0x8f8e8d8c8b8a8988,
5743 0x8786858483828180};
5744 InsrHelper(&masm, z9.VnD(), z9_inputs);
5745
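  // The unpack instructions widen half of the source lanes: `*unpklo` takes
  // the low half and `*unpkhi` the high half of the source elements, then
  // sign-extends (Sunpk*) or zero-extends (Uunpk*) each one into a lane of
  // twice the width. For example, a high-half byte of 0x88 becomes the
  // halfword 0xff88 after Sunpkhi, but 0x0088 after Uunpkhi.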
5746 __ Sunpkhi(z10.VnH(), z9.VnB());
5747 __ Sunpkhi(z11.VnS(), z9.VnH());
5748 __ Sunpkhi(z12.VnD(), z9.VnS());
5749
5750 __ Sunpklo(z13.VnH(), z9.VnB());
5751 __ Sunpklo(z14.VnS(), z9.VnH());
5752 __ Sunpklo(z15.VnD(), z9.VnS());
5753
5754 __ Uunpkhi(z16.VnH(), z9.VnB());
5755 __ Uunpkhi(z17.VnS(), z9.VnH());
5756 __ Uunpkhi(z18.VnD(), z9.VnS());
5757
5758 __ Uunpklo(z19.VnH(), z9.VnB());
5759 __ Uunpklo(z20.VnS(), z9.VnH());
5760 __ Uunpklo(z21.VnD(), z9.VnS());
5761
Martyn Capewell2e954292020-01-14 14:56:42 +00005762 // Test unpacking with same source and destination.
5763 __ Mov(z22, z9);
5764 __ Sunpklo(z22.VnH(), z22.VnB());
5765 __ Mov(z23, z9);
5766 __ Uunpklo(z23.VnH(), z23.VnB());
5767
TatWai Chong4f28df72019-08-14 17:50:30 -07005768 END();
5769
5770 if (CAN_RUN()) {
5771 RUN();
5772
5773    // Sunpkhi
5774 int lane_count = core.GetSVELaneCount(kHRegSize);
5775 for (int i = lane_count - 1; i >= 0; i--) {
5776 uint16_t expected = core.zreg_lane<uint16_t>(z10.GetCode(), i);
5777 uint8_t b_lane = core.zreg_lane<uint8_t>(z9.GetCode(), i + lane_count);
5778 uint16_t input = SignExtend<int16_t>(b_lane, kBRegSize);
5779 ASSERT_EQUAL_64(expected, input);
5780 }
5781
5782 lane_count = core.GetSVELaneCount(kSRegSize);
5783 for (int i = lane_count - 1; i >= 0; i--) {
5784 uint32_t expected = core.zreg_lane<uint32_t>(z11.GetCode(), i);
5785 uint16_t h_lane = core.zreg_lane<uint16_t>(z9.GetCode(), i + lane_count);
5786 uint32_t input = SignExtend<int32_t>(h_lane, kHRegSize);
5787 ASSERT_EQUAL_64(expected, input);
5788 }
5789
5790 lane_count = core.GetSVELaneCount(kDRegSize);
5791 for (int i = lane_count - 1; i >= 0; i--) {
5792 uint64_t expected = core.zreg_lane<uint64_t>(z12.GetCode(), i);
5793 uint32_t s_lane = core.zreg_lane<uint32_t>(z9.GetCode(), i + lane_count);
5794 uint64_t input = SignExtend<int64_t>(s_lane, kSRegSize);
5795 ASSERT_EQUAL_64(expected, input);
5796 }
5797
5798    // Sunpklo
5799 lane_count = core.GetSVELaneCount(kHRegSize);
5800 for (int i = lane_count - 1; i >= 0; i--) {
5801 uint16_t expected = core.zreg_lane<uint16_t>(z13.GetCode(), i);
5802 uint8_t b_lane = core.zreg_lane<uint8_t>(z9.GetCode(), i);
5803 uint16_t input = SignExtend<int16_t>(b_lane, kBRegSize);
5804 ASSERT_EQUAL_64(expected, input);
5805 }
5806
5807 lane_count = core.GetSVELaneCount(kSRegSize);
5808 for (int i = lane_count - 1; i >= 0; i--) {
5809 uint32_t expected = core.zreg_lane<uint32_t>(z14.GetCode(), i);
5810 uint16_t h_lane = core.zreg_lane<uint16_t>(z9.GetCode(), i);
5811 uint32_t input = SignExtend<int32_t>(h_lane, kHRegSize);
5812 ASSERT_EQUAL_64(expected, input);
5813 }
5814
5815 lane_count = core.GetSVELaneCount(kDRegSize);
5816 for (int i = lane_count - 1; i >= 0; i--) {
5817 uint64_t expected = core.zreg_lane<uint64_t>(z15.GetCode(), i);
5818 uint32_t s_lane = core.zreg_lane<uint32_t>(z9.GetCode(), i);
5819 uint64_t input = SignExtend<int64_t>(s_lane, kSRegSize);
5820 ASSERT_EQUAL_64(expected, input);
5821 }
5822
5823    // Uunpkhi
5824 lane_count = core.GetSVELaneCount(kHRegSize);
5825 for (int i = lane_count - 1; i >= 0; i--) {
5826 uint16_t expected = core.zreg_lane<uint16_t>(z16.GetCode(), i);
5827 uint16_t input = core.zreg_lane<uint8_t>(z9.GetCode(), i + lane_count);
5828 ASSERT_EQUAL_64(expected, input);
5829 }
5830
5831 lane_count = core.GetSVELaneCount(kSRegSize);
5832 for (int i = lane_count - 1; i >= 0; i--) {
5833 uint32_t expected = core.zreg_lane<uint32_t>(z17.GetCode(), i);
5834 uint32_t input = core.zreg_lane<uint16_t>(z9.GetCode(), i + lane_count);
5835 ASSERT_EQUAL_64(expected, input);
5836 }
5837
5838 lane_count = core.GetSVELaneCount(kDRegSize);
5839 for (int i = lane_count - 1; i >= 0; i--) {
5840 uint64_t expected = core.zreg_lane<uint64_t>(z18.GetCode(), i);
5841 uint64_t input = core.zreg_lane<uint32_t>(z9.GetCode(), i + lane_count);
5842 ASSERT_EQUAL_64(expected, input);
5843 }
5844
5845    // Uunpklo
5846 lane_count = core.GetSVELaneCount(kHRegSize);
5847 for (int i = lane_count - 1; i >= 0; i--) {
5848 uint16_t expected = core.zreg_lane<uint16_t>(z19.GetCode(), i);
5849 uint16_t input = core.zreg_lane<uint8_t>(z9.GetCode(), i);
5850 ASSERT_EQUAL_64(expected, input);
5851 }
5852
5853 lane_count = core.GetSVELaneCount(kSRegSize);
5854 for (int i = lane_count - 1; i >= 0; i--) {
5855 uint32_t expected = core.zreg_lane<uint32_t>(z20.GetCode(), i);
5856 uint32_t input = core.zreg_lane<uint16_t>(z9.GetCode(), i);
5857 ASSERT_EQUAL_64(expected, input);
5858 }
5859
5860 lane_count = core.GetSVELaneCount(kDRegSize);
5861 for (int i = lane_count - 1; i >= 0; i--) {
5862 uint64_t expected = core.zreg_lane<uint64_t>(z21.GetCode(), i);
5863 uint64_t input = core.zreg_lane<uint32_t>(z9.GetCode(), i);
5864 ASSERT_EQUAL_64(expected, input);
5865 }
Martyn Capewell2e954292020-01-14 14:56:42 +00005866
5867 ASSERT_EQUAL_SVE(z13, z22);
5868 ASSERT_EQUAL_SVE(z19, z23);
TatWai Chong4f28df72019-08-14 17:50:30 -07005869 }
5870}
5871
Jacob Bramleybc21a0d2019-09-20 18:49:15 +01005872TEST_SVE(sve_cnot_not) {
5873 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5874 START();
5875
5876 uint64_t in[] = {0x0000000000000000, 0x00000000e1c30000, 0x123456789abcdef0};
5877
5878 // For simplicity, we re-use the same pg for various lane sizes.
5879 // For D lanes: 1, 1, 0
5880 // For S lanes: 1, 1, 1, 0, 0
5881 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
5882 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
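  // Only the bit corresponding to the lowest-numbered byte of each element is
  // significant for wider lanes, so the H, S and D patterns above are every
  // second, fourth and eighth entry of pg_in, counting from the lane-0
  // (right-hand) end of the array.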
5883 Initialise(&masm, p0.VnB(), pg_in);
5884 PRegisterM pg = p0.Merging();
5885
5886 // These are merging operations, so we have to initialise the result register.
5887 // We use a mixture of constructive and destructive operations.
5888
5889 InsrHelper(&masm, z31.VnD(), in);
TatWai Chong6995bfd2019-09-26 10:48:05 +01005890 // Make a copy so we can check that constructive operations preserve zn.
Jacob Bramleybc21a0d2019-09-20 18:49:15 +01005891 __ Mov(z30, z31);
5892
5893 // For constructive operations, use a different initial result value.
5894 __ Index(z29.VnB(), 0, -1);
5895
5896 __ Mov(z0, z31);
5897 __ Cnot(z0.VnB(), pg, z0.VnB()); // destructive
5898 __ Mov(z1, z29);
5899 __ Cnot(z1.VnH(), pg, z31.VnH());
5900 __ Mov(z2, z31);
5901 __ Cnot(z2.VnS(), pg, z2.VnS()); // destructive
5902 __ Mov(z3, z29);
5903 __ Cnot(z3.VnD(), pg, z31.VnD());
5904
5905 __ Mov(z4, z29);
5906 __ Not(z4.VnB(), pg, z31.VnB());
5907 __ Mov(z5, z31);
5908 __ Not(z5.VnH(), pg, z5.VnH()); // destructive
5909 __ Mov(z6, z29);
5910 __ Not(z6.VnS(), pg, z31.VnS());
5911 __ Mov(z7, z31);
5912 __ Not(z7.VnD(), pg, z7.VnD()); // destructive
5913
5914 END();
5915
5916 if (CAN_RUN()) {
5917 RUN();
5918
5919 // Check that constructive operations preserve their inputs.
5920 ASSERT_EQUAL_SVE(z30, z31);
5921
5922 // clang-format off
5923
5924 // Cnot (B) destructive
5925 uint64_t expected_z0[] =
5926 // pg: 0 0 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0
5927 {0x0000000001000101, 0x01000001e1000101, 0x12340078000000f0};
5928 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
5929
5930 // Cnot (H)
5931 uint64_t expected_z1[] =
5932 // pg: 0 0 0 1 0 1 1 1 0 0 1 0
5933 {0xe9eaebecedee0001, 0xf1f2000100000001, 0xf9fafbfc0000ff00};
5934 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
5935
5936 // Cnot (S) destructive
5937 uint64_t expected_z2[] =
5938 // pg: 0 1 1 1 0 0
5939 {0x0000000000000001, 0x0000000100000000, 0x123456789abcdef0};
5940 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
5941
5942 // Cnot (D)
5943 uint64_t expected_z3[] =
5944 // pg: 1 1 0
5945 {0x0000000000000001, 0x0000000000000000, 0xf9fafbfcfdfeff00};
5946 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
5947
5948 // Not (B)
5949 uint64_t expected_z4[] =
5950 // pg: 0 0 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0
5951 {0xe9eaebecffeeffff, 0xfff2f3fff53cffff, 0xf9faa9fc65432100};
5952 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
5953
5954 // Not (H) destructive
5955 uint64_t expected_z5[] =
5956 // pg: 0 0 0 1 0 1 1 1 0 0 1 0
5957 {0x000000000000ffff, 0x0000ffff1e3cffff, 0x123456786543def0};
5958 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
5959
5960 // Not (S)
5961 uint64_t expected_z6[] =
5962 // pg: 0 1 1 1 0 0
5963 {0xe9eaebecffffffff, 0xffffffff1e3cffff, 0xf9fafbfcfdfeff00};
5964 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
5965
5966 // Not (D) destructive
5967 uint64_t expected_z7[] =
5968 // pg: 1 1 0
5969 {0xffffffffffffffff, 0xffffffff1e3cffff, 0x123456789abcdef0};
5970 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
5971
5972 // clang-format on
5973 }
5974}
5975
5976TEST_SVE(sve_fabs_fneg) {
5977 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
5978 START();
5979
5980 // Include FP64, FP32 and FP16 signalling NaNs. Most FP operations quieten
5981 // NaNs, but fabs and fneg do not.
5982 uint64_t in[] = {0xc04500004228d140, // Recognisable (+/-42) values.
5983 0xfff00000ff80fc01, // Signalling NaNs.
5984 0x123456789abcdef0};
5985
5986 // For simplicity, we re-use the same pg for various lane sizes.
5987 // For D lanes: 1, 1, 0
5988 // For S lanes: 1, 1, 1, 0, 0
5989 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
5990 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
5991 Initialise(&masm, p0.VnB(), pg_in);
5992 PRegisterM pg = p0.Merging();
5993
5994 // These are merging operations, so we have to initialise the result register.
5995 // We use a mixture of constructive and destructive operations.
5996
5997 InsrHelper(&masm, z31.VnD(), in);
TatWai Chong6995bfd2019-09-26 10:48:05 +01005998 // Make a copy so we can check that constructive operations preserve zn.
Jacob Bramleybc21a0d2019-09-20 18:49:15 +01005999 __ Mov(z30, z31);
6000
6001 // For constructive operations, use a different initial result value.
6002 __ Index(z29.VnB(), 0, -1);
6003
6004 __ Mov(z0, z29);
6005 __ Fabs(z0.VnH(), pg, z31.VnH());
6006 __ Mov(z1, z31);
6007 __ Fabs(z1.VnS(), pg, z1.VnS()); // destructive
6008 __ Mov(z2, z29);
6009 __ Fabs(z2.VnD(), pg, z31.VnD());
6010
6011 __ Mov(z3, z31);
6012 __ Fneg(z3.VnH(), pg, z3.VnH()); // destructive
6013 __ Mov(z4, z29);
6014 __ Fneg(z4.VnS(), pg, z31.VnS());
6015 __ Mov(z5, z31);
6016 __ Fneg(z5.VnD(), pg, z5.VnD()); // destructive
6017
6018 END();
6019
6020 if (CAN_RUN()) {
6021 RUN();
6022
6023 // Check that constructive operations preserve their inputs.
6024 ASSERT_EQUAL_SVE(z30, z31);
6025
6026 // clang-format off
6027
6028 // Fabs (H)
6029 uint64_t expected_z0[] =
6030 // pg: 0 0 0 1 0 1 1 1 0 0 1 0
6031 {0xe9eaebecedee5140, 0xf1f200007f807c01, 0xf9fafbfc1abcff00};
6032 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
6033
6034 // Fabs (S) destructive
6035 uint64_t expected_z1[] =
6036 // pg: 0 1 1 1 0 0
6037 {0xc04500004228d140, 0x7ff000007f80fc01, 0x123456789abcdef0};
6038 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
6039
6040 // Fabs (D)
6041 uint64_t expected_z2[] =
6042 // pg: 1 1 0
6043 {0x404500004228d140, 0x7ff00000ff80fc01, 0xf9fafbfcfdfeff00};
6044 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
6045
6046 // Fneg (H) destructive
6047 uint64_t expected_z3[] =
6048 // pg: 0 0 0 1 0 1 1 1 0 0 1 0
6049 {0xc045000042285140, 0xfff080007f807c01, 0x123456781abcdef0};
6050 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
6051
6052 // Fneg (S)
6053 uint64_t expected_z4[] =
6054 // pg: 0 1 1 1 0 0
6055 {0xe9eaebecc228d140, 0x7ff000007f80fc01, 0xf9fafbfcfdfeff00};
6056 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
6057
6058 // Fneg (D) destructive
6059 uint64_t expected_z5[] =
6060 // pg: 1 1 0
6061 {0x404500004228d140, 0x7ff00000ff80fc01, 0x123456789abcdef0};
6062 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
6063
6064 // clang-format on
6065 }
6066}
6067
6068TEST_SVE(sve_cls_clz_cnt) {
6069 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6070 START();
6071
6072 uint64_t in[] = {0x0000000000000000, 0xfefcf8f0e1c3870f, 0x123456789abcdef0};
6073
6074 // For simplicity, we re-use the same pg for various lane sizes.
6075 // For D lanes: 1, 1, 0
6076 // For S lanes: 1, 1, 1, 0, 0
6077 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
6078 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
6079 Initialise(&masm, p0.VnB(), pg_in);
6080 PRegisterM pg = p0.Merging();
6081
6082 // These are merging operations, so we have to initialise the result register.
6083 // We use a mixture of constructive and destructive operations.
6084
6085 InsrHelper(&masm, z31.VnD(), in);
TatWai Chong6995bfd2019-09-26 10:48:05 +01006086 // Make a copy so we can check that constructive operations preserve zn.
Jacob Bramleybc21a0d2019-09-20 18:49:15 +01006087 __ Mov(z30, z31);
6088
6089 // For constructive operations, use a different initial result value.
6090 __ Index(z29.VnB(), 0, -1);
6091
6092 __ Mov(z0, z29);
6093 __ Cls(z0.VnB(), pg, z31.VnB());
6094 __ Mov(z1, z31);
6095 __ Clz(z1.VnH(), pg, z1.VnH()); // destructive
6096 __ Mov(z2, z29);
6097 __ Cnt(z2.VnS(), pg, z31.VnS());
6098 __ Mov(z3, z31);
6099 __ Cnt(z3.VnD(), pg, z3.VnD()); // destructive
6100
6101 END();
6102
6103 if (CAN_RUN()) {
6104 RUN();
6105 // Check that non-destructive operations preserve their inputs.
6106 ASSERT_EQUAL_SVE(z30, z31);
6107
6108 // clang-format off
6109
6110 // cls (B)
6111 uint8_t expected_z0[] =
6112 // pg: 0 0 0 0 1 0 1 1
6113 // pg: 1 0 0 1 0 1 1 1
6114 // pg: 0 0 1 0 1 1 1 0
6115 {0xe9, 0xea, 0xeb, 0xec, 7, 0xee, 7, 7,
6116 6, 0xf2, 0xf3, 3, 0xf5, 1, 0, 3,
6117 0xf9, 0xfa, 0, 0xfc, 0, 0, 1, 0x00};
6118 ASSERT_EQUAL_SVE(expected_z0, z0.VnB());
6119
6120 // clz (H) destructive
6121 uint16_t expected_z1[] =
6122 // pg: 0 0 0 1
6123 // pg: 0 1 1 1
6124 // pg: 0 0 1 0
6125 {0x0000, 0x0000, 0x0000, 16,
6126 0xfefc, 0, 0, 0,
6127 0x1234, 0x5678, 0, 0xdef0};
6128 ASSERT_EQUAL_SVE(expected_z1, z1.VnH());
6129
6130 // cnt (S)
6131 uint32_t expected_z2[] =
6132 // pg: 0 1
6133 // pg: 1 1
6134 // pg: 0 0
6135 {0xe9eaebec, 0,
6136 22, 16,
6137 0xf9fafbfc, 0xfdfeff00};
6138 ASSERT_EQUAL_SVE(expected_z2, z2.VnS());
6139
6140 // cnt (D) destructive
6141 uint64_t expected_z3[] =
6142 // pg: 1 1 0
6143 { 0, 38, 0x123456789abcdef0};
6144 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
6145
6146 // clang-format on
6147 }
6148}
6149
6150TEST_SVE(sve_sxt) {
6151 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6152 START();
6153
6154 uint64_t in[] = {0x01f203f405f607f8, 0xfefcf8f0e1c3870f, 0x123456789abcdef0};
6155
6156 // For simplicity, we re-use the same pg for various lane sizes.
6157 // For D lanes: 1, 1, 0
6158 // For S lanes: 1, 1, 1, 0, 0
6159 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
6160 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
6161 Initialise(&masm, p0.VnB(), pg_in);
6162 PRegisterM pg = p0.Merging();
6163
6164 // These are merging operations, so we have to initialise the result register.
6165 // We use a mixture of constructive and destructive operations.
6166
6167 InsrHelper(&masm, z31.VnD(), in);
TatWai Chong6995bfd2019-09-26 10:48:05 +01006168 // Make a copy so we can check that constructive operations preserve zn.
Jacob Bramleybc21a0d2019-09-20 18:49:15 +01006169 __ Mov(z30, z31);
6170
6171 // For constructive operations, use a different initial result value.
6172 __ Index(z29.VnB(), 0, -1);
6173
6174 __ Mov(z0, z31);
6175 __ Sxtb(z0.VnH(), pg, z0.VnH()); // destructive
6176 __ Mov(z1, z29);
6177 __ Sxtb(z1.VnS(), pg, z31.VnS());
6178 __ Mov(z2, z31);
6179 __ Sxtb(z2.VnD(), pg, z2.VnD()); // destructive
6180 __ Mov(z3, z29);
6181 __ Sxth(z3.VnS(), pg, z31.VnS());
6182 __ Mov(z4, z31);
6183 __ Sxth(z4.VnD(), pg, z4.VnD()); // destructive
6184 __ Mov(z5, z29);
6185 __ Sxtw(z5.VnD(), pg, z31.VnD());
6186
6187 END();
6188
6189 if (CAN_RUN()) {
6190 RUN();
6191 // Check that constructive operations preserve their inputs.
6192 ASSERT_EQUAL_SVE(z30, z31);
6193
6194 // clang-format off
6195
6196 // Sxtb (H) destructive
6197 uint64_t expected_z0[] =
6198 // pg: 0 0 0 1 0 1 1 1 0 0 1 0
6199 {0x01f203f405f6fff8, 0xfefcfff0ffc3000f, 0x12345678ffbcdef0};
6200 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
6201
6202 // Sxtb (S)
6203 uint64_t expected_z1[] =
6204 // pg: 0 1 1 1 0 0
6205 {0xe9eaebecfffffff8, 0xfffffff00000000f, 0xf9fafbfcfdfeff00};
6206 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
6207
6208 // Sxtb (D) destructive
6209 uint64_t expected_z2[] =
6210 // pg: 1 1 0
6211 {0xfffffffffffffff8, 0x000000000000000f, 0x123456789abcdef0};
6212 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
6213
6214 // Sxth (S)
6215 uint64_t expected_z3[] =
6216 // pg: 0 1 1 1 0 0
6217 {0xe9eaebec000007f8, 0xfffff8f0ffff870f, 0xf9fafbfcfdfeff00};
6218 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
6219
6220 // Sxth (D) destructive
6221 uint64_t expected_z4[] =
6222 // pg: 1 1 0
6223 {0x00000000000007f8, 0xffffffffffff870f, 0x123456789abcdef0};
6224 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
6225
6226 // Sxtw (D)
6227 uint64_t expected_z5[] =
6228 // pg: 1 1 0
6229 {0x0000000005f607f8, 0xffffffffe1c3870f, 0xf9fafbfcfdfeff00};
6230 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
6231
6232 // clang-format on
6233 }
6234}
6235
6236TEST_SVE(sve_uxt) {
6237 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6238 START();
6239
6240 uint64_t in[] = {0x01f203f405f607f8, 0xfefcf8f0e1c3870f, 0x123456789abcdef0};
6241
6242 // For simplicity, we re-use the same pg for various lane sizes.
6243 // For D lanes: 1, 1, 0
6244 // For S lanes: 1, 1, 1, 0, 0
6245 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
6246 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
6247 Initialise(&masm, p0.VnB(), pg_in);
6248 PRegisterM pg = p0.Merging();
6249
6250 // These are merging operations, so we have to initialise the result register.
6251 // We use a mixture of constructive and destructive operations.
6252
6253 InsrHelper(&masm, z31.VnD(), in);
TatWai Chong6995bfd2019-09-26 10:48:05 +01006254 // Make a copy so we can check that constructive operations preserve zn.
Jacob Bramleybc21a0d2019-09-20 18:49:15 +01006255 __ Mov(z30, z31);
6256
6257 // For constructive operations, use a different initial result value.
6258 __ Index(z29.VnB(), 0, -1);
6259
6260 __ Mov(z0, z29);
6261 __ Uxtb(z0.VnH(), pg, z31.VnH());
6262 __ Mov(z1, z31);
6263 __ Uxtb(z1.VnS(), pg, z1.VnS()); // destructive
6264 __ Mov(z2, z29);
6265 __ Uxtb(z2.VnD(), pg, z31.VnD());
6266 __ Mov(z3, z31);
6267 __ Uxth(z3.VnS(), pg, z3.VnS()); // destructive
6268 __ Mov(z4, z29);
6269 __ Uxth(z4.VnD(), pg, z31.VnD());
6270 __ Mov(z5, z31);
6271 __ Uxtw(z5.VnD(), pg, z5.VnD()); // destructive
6272
6273 END();
6274
6275 if (CAN_RUN()) {
6276 RUN();
6277 // clang-format off
6278
6279 // Uxtb (H)
6280 uint64_t expected_z0[] =
6281 // pg: 0 0 0 1 0 1 1 1 0 0 1 0
6282 {0xe9eaebecedee00f8, 0xf1f200f000c3000f, 0xf9fafbfc00bcff00};
6283 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
6284
6285 // Uxtb (S) destructive
6286 uint64_t expected_z1[] =
6287 // pg: 0 1 1 1 0 0
6288 {0x01f203f4000000f8, 0x000000f00000000f, 0x123456789abcdef0};
6289 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
6290
6291 // Uxtb (D)
6292 uint64_t expected_z2[] =
6293 // pg: 1 1 0
6294 {0x00000000000000f8, 0x000000000000000f, 0xf9fafbfcfdfeff00};
6295 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
6296
6297 // Uxth (S) destructive
6298 uint64_t expected_z3[] =
6299 // pg: 0 1 1 1 0 0
6300 {0x01f203f4000007f8, 0x0000f8f00000870f, 0x123456789abcdef0};
6301 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
6302
6303 // Uxth (D)
6304 uint64_t expected_z4[] =
6305 // pg: 1 1 0
6306 {0x00000000000007f8, 0x000000000000870f, 0xf9fafbfcfdfeff00};
6307 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
6308
6309 // Uxtw (D) destructive
6310 uint64_t expected_z5[] =
6311 // pg: 1 1 0
6312 {0x0000000005f607f8, 0x00000000e1c3870f, 0x123456789abcdef0};
6313 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
6314
6315 // clang-format on
6316 }
6317}
6318
6319TEST_SVE(sve_abs_neg) {
6320 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6321 START();
6322
6323 uint64_t in[] = {0x01f203f405f607f8, 0xfefcf8f0e1c3870f, 0x123456789abcdef0};
6324
6325 // For simplicity, we re-use the same pg for various lane sizes.
6326 // For D lanes: 1, 1, 0
6327 // For S lanes: 1, 1, 1, 0, 0
6328 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
6329 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
6330 Initialise(&masm, p0.VnB(), pg_in);
6331 PRegisterM pg = p0.Merging();
6332
6335 // These are merging operations, so we have to initialise the result register.
6336 // We use a mixture of constructive and destructive operations.
6337
6338 InsrHelper(&masm, z31.VnD(), in);
TatWai Chong6995bfd2019-09-26 10:48:05 +01006339 // Make a copy so we can check that constructive operations preserve zn.
Jacob Bramleybc21a0d2019-09-20 18:49:15 +01006340 __ Mov(z30, z31);
6341
6342 // For constructive operations, use a different initial result value.
6343 __ Index(z29.VnB(), 0, -1);
6344
6345 __ Mov(z0, z31);
6346 __ Abs(z0.VnD(), pg, z0.VnD()); // destructive
6347 __ Mov(z1, z29);
6348 __ Abs(z1.VnB(), pg, z31.VnB());
6349
6350 __ Mov(z2, z31);
6351 __ Neg(z2.VnH(), pg, z2.VnH()); // destructive
6352 __ Mov(z3, z29);
6353 __ Neg(z3.VnS(), pg, z31.VnS());
6354
Jacob Bramleyc0066272019-09-30 16:30:47 +01006355 // The unpredicated form of `Neg` is implemented using `subr`.
6356 __ Mov(z4, z31);
6357 __ Neg(z4.VnB(), z4.VnB()); // destructive
6358 __ Mov(z5, z29);
6359 __ Neg(z5.VnD(), z31.VnD());
6360
Jacob Bramleybc21a0d2019-09-20 18:49:15 +01006361 END();
6362
6363 if (CAN_RUN()) {
6364 RUN();
Jacob Bramleyc0066272019-09-30 16:30:47 +01006365
6366 ASSERT_EQUAL_SVE(z30, z31);
6367
Jacob Bramleybc21a0d2019-09-20 18:49:15 +01006368 // clang-format off
6369
6370 // Abs (D) destructive
6371 uint64_t expected_z0[] =
6372 // pg: 1 1 0
6373 {0x01f203f405f607f8, 0x0103070f1e3c78f1, 0x123456789abcdef0};
6374 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
6375
6376 // Abs (B)
6377 uint64_t expected_z1[] =
6378 // pg: 0 0 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0 0 1 0 1 1 1 0
6379 {0xe9eaebec05ee0708, 0x02f2f310f53d790f, 0xf9fa56fc66442200};
6380 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
6381
6382 // Neg (H) destructive
6383 uint64_t expected_z2[] =
6384 // pg: 0 0 0 1 0 1 1 1 0 0 1 0
6385 {0x01f203f405f6f808, 0xfefc07101e3d78f1, 0x123456786544def0};
6386 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
6387
6388 // Neg (S)
6389 uint64_t expected_z3[] =
6390 // pg: 0 1 1 1 0 0
6391 {0xe9eaebecfa09f808, 0x010307101e3c78f1, 0xf9fafbfcfdfeff00};
6392 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
6393
Jacob Bramleyc0066272019-09-30 16:30:47 +01006394 // Neg (B) destructive, unpredicated
6395 uint64_t expected_z4[] =
6396 {0xff0efd0cfb0af908, 0x020408101f3d79f1, 0xeeccaa8866442210};
6397 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
6398
6399 // Neg (D) unpredicated
6400 uint64_t expected_z5[] =
6401 {0xfe0dfc0bfa09f808, 0x0103070f1e3c78f1, 0xedcba98765432110};
6402 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
6403
Jacob Bramleybc21a0d2019-09-20 18:49:15 +01006404 // clang-format on
6405 }
6406}
6407
Jacob Bramley0093bb92019-10-04 15:54:10 +01006408TEST_SVE(sve_cpy) {
6409 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
6410 START();
6411
6412 // For simplicity, we re-use the same pg for various lane sizes.
6413 // For D lanes: 0, 1, 1
6414 // For S lanes: 0, 1, 1, 0, 1
6415 // For H lanes: 1, 0, 0, 1, 0, 1, 1, 0, 0, 1
6416 int pg_in[] = {1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1};
6417
6418 PRegisterM pg = p7.Merging();
6419 Initialise(&masm, pg.VnB(), pg_in);
6420
6421 // These are merging operations, so we have to initialise the result registers
6422 // for each operation.
6423 for (unsigned i = 0; i < kNumberOfZRegisters; i++) {
6424 __ Index(ZRegister(i, kBRegSize), 0, -1);
6425 }
6426
6427 // Recognisable values to copy.
6428 __ Mov(x0, 0xdeadbeefdeadbe42);
6429 __ Mov(x1, 0xdeadbeefdead8421);
6430 __ Mov(x2, 0xdeadbeef80042001);
6431 __ Mov(x3, 0x8000000420000001);
6432
6433  // Use NEON moves to avoid testing SVE `cpy` against itself.
6434 __ Dup(v28.V2D(), x0);
6435 __ Dup(v29.V2D(), x1);
6436 __ Dup(v30.V2D(), x2);
6437 __ Dup(v31.V2D(), x3);
6438
6439 // Register forms (CPY_z_p_r)
6440 __ Cpy(z0.VnB(), pg, w0);
6441 __ Cpy(z1.VnH(), pg, x1); // X registers are accepted for small lanes.
6442 __ Cpy(z2.VnS(), pg, w2);
6443 __ Cpy(z3.VnD(), pg, x3);
6444
6445 // VRegister forms (CPY_z_p_v)
6446 __ Cpy(z4.VnB(), pg, b28);
6447 __ Cpy(z5.VnH(), pg, h29);
6448 __ Cpy(z6.VnS(), pg, s30);
6449 __ Cpy(z7.VnD(), pg, d31);
6450
6451 // Check that we can copy the stack pointer.
6452 __ Mov(x10, sp);
6453 __ Mov(sp, 0xabcabcabcabcabca); // Set sp to a known value.
6454 __ Cpy(z16.VnB(), pg, sp);
6455 __ Cpy(z17.VnH(), pg, wsp);
6456 __ Cpy(z18.VnS(), pg, wsp);
6457 __ Cpy(z19.VnD(), pg, sp);
6458 __ Mov(sp, x10); // Restore sp.
6459
6460 END();
6461
6462 if (CAN_RUN()) {
6463 RUN();
6464 // clang-format off
6465
6466 uint64_t expected_b[] =
6467 // pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1
6468 {0xe9eaebec424242f0, 0x42f2f34242f64242, 0xf942fbfcfdfeff42};
6469 ASSERT_EQUAL_SVE(expected_b, z0.VnD());
6470 ASSERT_EQUAL_SVE(expected_b, z4.VnD());
6471
6472 uint64_t expected_h[] =
6473 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6474 {0xe9eaebec8421eff0, 0xf1f28421f5f68421, 0x8421fbfcfdfe8421};
6475 ASSERT_EQUAL_SVE(expected_h, z1.VnD());
6476 ASSERT_EQUAL_SVE(expected_h, z5.VnD());
6477
6478 uint64_t expected_s[] =
6479 // pg: 0 0 1 1 0 1
6480 {0xe9eaebecedeeeff0, 0x8004200180042001, 0xf9fafbfc80042001};
6481 ASSERT_EQUAL_SVE(expected_s, z2.VnD());
6482 ASSERT_EQUAL_SVE(expected_s, z6.VnD());
6483
6484 uint64_t expected_d[] =
6485 // pg: 0 1 1
6486 {0xe9eaebecedeeeff0, 0x8000000420000001, 0x8000000420000001};
6487 ASSERT_EQUAL_SVE(expected_d, z3.VnD());
6488 ASSERT_EQUAL_SVE(expected_d, z7.VnD());
6489
6490
6491 uint64_t expected_b_sp[] =
6492 // pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1
6493 {0xe9eaebeccacacaf0, 0xcaf2f3cacaf6caca, 0xf9cafbfcfdfeffca};
6494 ASSERT_EQUAL_SVE(expected_b_sp, z16.VnD());
6495
6496 uint64_t expected_h_sp[] =
6497 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6498 {0xe9eaebecabcaeff0, 0xf1f2abcaf5f6abca, 0xabcafbfcfdfeabca};
6499 ASSERT_EQUAL_SVE(expected_h_sp, z17.VnD());
6500
6501 uint64_t expected_s_sp[] =
6502 // pg: 0 0 1 1 0 1
6503 {0xe9eaebecedeeeff0, 0xcabcabcacabcabca, 0xf9fafbfccabcabca};
6504 ASSERT_EQUAL_SVE(expected_s_sp, z18.VnD());
6505
6506 uint64_t expected_d_sp[] =
6507 // pg: 0 1 1
6508 {0xe9eaebecedeeeff0, 0xabcabcabcabcabca, 0xabcabcabcabcabca};
6509 ASSERT_EQUAL_SVE(expected_d_sp, z19.VnD());
6510
6511 // clang-format on
6512 }
6513}
6514
Jacob Bramley0f62eab2019-10-23 17:07:47 +01006515TEST_SVE(sve_cpy_imm) {
6516 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6517 START();
6518
6519 // For simplicity, we re-use the same pg for various lane sizes.
6520 // For D lanes: 0, 1, 1
6521 // For S lanes: 0, 1, 1, 0, 1
6522 // For H lanes: 1, 0, 0, 1, 0, 1, 1, 0, 0, 1
6523 int pg_in[] = {1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1};
6524
6525 PRegister pg = p7;
6526 Initialise(&masm, pg.VnB(), pg_in);
6527
6528 // These are (mostly) merging operations, so we have to initialise the result
6529 // registers for each operation.
6530 for (unsigned i = 0; i < kNumberOfZRegisters; i++) {
6531 __ Index(ZRegister(i, kBRegSize), 0, -1);
6532 }
6533
6534 // Encodable integer forms (CPY_z_p_i)
6535 __ Cpy(z0.VnB(), pg.Merging(), 0);
6536 __ Cpy(z1.VnB(), pg.Zeroing(), 42);
6537 __ Cpy(z2.VnB(), pg.Merging(), -42);
6538 __ Cpy(z3.VnB(), pg.Zeroing(), 0xff);
6539 __ Cpy(z4.VnH(), pg.Merging(), 127);
6540 __ Cpy(z5.VnS(), pg.Zeroing(), -128);
6541 __ Cpy(z6.VnD(), pg.Merging(), -1);
6542
6543 // Forms encodable using fcpy.
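  // Like the FMOV immediate, fcpy's 8-bit encoding covers values of the form
  // (16..31)/16 * 2^n with n in [-3, 4], i.e. +/-0.125 to +/-31.0 in limited
  // steps, which is why -31.0, 2.0 and -4.0 are encodable here.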
6544 __ Cpy(z7.VnH(), pg.Merging(), Float16ToRawbits(Float16(-31.0)));
6545 __ Cpy(z8.VnS(), pg.Zeroing(), FloatToRawbits(2.0f));
6546 __ Cpy(z9.VnD(), pg.Merging(), DoubleToRawbits(-4.0));
6547
6548 // Other forms use a scratch register.
6549 __ Cpy(z10.VnH(), pg.Merging(), 0xff);
6550 __ Cpy(z11.VnD(), pg.Zeroing(), 0x0123456789abcdef);
6551
6552 END();
6553
6554 if (CAN_RUN()) {
6555 RUN();
6556 // clang-format off
6557
6558 uint64_t expected_z0[] =
6559 // pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1
6560 {0xe9eaebec000000f0, 0x00f2f30000f60000, 0xf900fbfcfdfeff00};
6561 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
6562
6563 uint64_t expected_z1[] =
6564 // pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1
6565 {0x000000002a2a2a00, 0x2a00002a2a002a2a, 0x002a00000000002a};
6566 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
6567
6568 uint64_t expected_z2[] =
6569 // pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1
6570 {0xe9eaebecd6d6d6f0, 0xd6f2f3d6d6f6d6d6, 0xf9d6fbfcfdfeffd6};
6571 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
6572
6573 uint64_t expected_z3[] =
6574 // pg: 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1
6575 {0x00000000ffffff00, 0xff0000ffff00ffff, 0x00ff0000000000ff};
6576 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
6577
6578 uint64_t expected_z4[] =
6579 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6580 {0xe9eaebec007feff0, 0xf1f2007ff5f6007f, 0x007ffbfcfdfe007f};
6581 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
6582
6583 uint64_t expected_z5[] =
6584 // pg: 0 0 1 1 0 1
6585 {0x0000000000000000, 0xffffff80ffffff80, 0x00000000ffffff80};
6586 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
6587
6588 uint64_t expected_z6[] =
6589 // pg: 0 1 1
6590 {0xe9eaebecedeeeff0, 0xffffffffffffffff, 0xffffffffffffffff};
6591 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
6592
6593 uint64_t expected_z7[] =
6594 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6595 {0xe9eaebeccfc0eff0, 0xf1f2cfc0f5f6cfc0, 0xcfc0fbfcfdfecfc0};
6596 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
6597
6598 uint64_t expected_z8[] =
6599 // pg: 0 0 1 1 0 1
6600 {0x0000000000000000, 0x4000000040000000, 0x0000000040000000};
6601 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
6602
6603 uint64_t expected_z9[] =
6604 // pg: 0 1 1
6605 {0xe9eaebecedeeeff0, 0xc010000000000000, 0xc010000000000000};
6606 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
6607
6608 uint64_t expected_z10[] =
6609 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6610 {0xe9eaebec00ffeff0, 0xf1f200fff5f600ff, 0x00fffbfcfdfe00ff};
6611 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
6612
6613 uint64_t expected_z11[] =
6614 // pg: 0 1 1
6615 {0x0000000000000000, 0x0123456789abcdef, 0x0123456789abcdef};
6616 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
6617
6618 // clang-format on
6619 }
6620}
6621
6622TEST_SVE(sve_fcpy_imm) {
6623 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6624 START();
6625
6626 // For simplicity, we re-use the same pg for various lane sizes.
6627 // For D lanes: 0, 1, 1
6628 // For S lanes: 0, 1, 1, 0, 1
6629 // For H lanes: 1, 0, 0, 1, 0, 1, 1, 0, 0, 1
6630 int pg_in[] = {1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1};
6631
6632 PRegister pg = p7;
6633 Initialise(&masm, pg.VnB(), pg_in);
6634
6635 // These are (mostly) merging operations, so we have to initialise the result
6636 // registers for each operation.
6637 for (unsigned i = 0; i < kNumberOfZRegisters; i++) {
6638 __ Index(ZRegister(i, kBRegSize), 0, -1);
6639 }
6640
6641 // Encodable floating-point forms (FCPY_z_p_i)
6642 __ Fcpy(z1.VnH(), pg.Merging(), Float16(1.0));
6643 __ Fcpy(z2.VnH(), pg.Merging(), -2.0f);
6644 __ Fcpy(z3.VnH(), pg.Merging(), 3.0);
6645 __ Fcpy(z4.VnS(), pg.Merging(), Float16(-4.0));
6646 __ Fcpy(z5.VnS(), pg.Merging(), 5.0f);
6647 __ Fcpy(z6.VnS(), pg.Merging(), 6.0);
6648 __ Fcpy(z7.VnD(), pg.Merging(), Float16(7.0));
6649 __ Fcpy(z8.VnD(), pg.Merging(), 8.0f);
Martyn Capewell7db82102020-06-02 16:40:09 +01006650 __ Fmov(z9.VnD(), pg.Merging(), -9.0);
Jacob Bramley0f62eab2019-10-23 17:07:47 +01006651
6652 // Unencodable immediates.
6653 __ Fcpy(z10.VnS(), pg.Merging(), 0.0);
6654 __ Fcpy(z11.VnH(), pg.Merging(), Float16(42.0));
6655 __ Fcpy(z12.VnD(), pg.Merging(), RawbitsToDouble(0x7ff0000012340000)); // NaN
6656 __ Fcpy(z13.VnH(), pg.Merging(), kFP64NegativeInfinity);
6657
Martyn Capewell7db82102020-06-02 16:40:09 +01006658 // Fmov alias.
6659 __ Fmov(z14.VnS(), pg.Merging(), 0.0);
6660 __ Fmov(z15.VnH(), pg.Merging(), Float16(42.0));
6661 __ Fmov(z16.VnD(), pg.Merging(), RawbitsToDouble(0x7ff0000012340000)); // NaN
6662 __ Fmov(z17.VnH(), pg.Merging(), kFP64NegativeInfinity);
Jacob Bramley0f62eab2019-10-23 17:07:47 +01006663 END();
6664
6665 if (CAN_RUN()) {
6666 RUN();
6667 // clang-format off
6668
6669 // 1.0 as FP16: 0x3c00
6670 uint64_t expected_z1[] =
6671 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6672 {0xe9eaebec3c00eff0, 0xf1f23c00f5f63c00, 0x3c00fbfcfdfe3c00};
6673 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
6674
6675 // -2.0 as FP16: 0xc000
6676 uint64_t expected_z2[] =
6677 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6678 {0xe9eaebecc000eff0, 0xf1f2c000f5f6c000, 0xc000fbfcfdfec000};
6679 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
6680
6681 // 3.0 as FP16: 0x4200
6682 uint64_t expected_z3[] =
6683 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6684 {0xe9eaebec4200eff0, 0xf1f24200f5f64200, 0x4200fbfcfdfe4200};
6685 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
6686
6687 // -4.0 as FP32: 0xc0800000
6688 uint64_t expected_z4[] =
6689 // pg: 0 0 1 1 0 1
6690 {0xe9eaebecedeeeff0, 0xc0800000c0800000, 0xf9fafbfcc0800000};
6691 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
6692
6693 // 5.0 as FP32: 0x40a00000
6694 uint64_t expected_z5[] =
6695 // pg: 0 0 1 1 0 1
6696 {0xe9eaebecedeeeff0, 0x40a0000040a00000, 0xf9fafbfc40a00000};
6697 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
6698
6699 // 6.0 as FP32: 0x40c00000
6700 uint64_t expected_z6[] =
6701 // pg: 0 0 1 1 0 1
6702 {0xe9eaebecedeeeff0, 0x40c0000040c00000, 0xf9fafbfc40c00000};
6703 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
6704
6705 // 7.0 as FP64: 0x401c000000000000
6706 uint64_t expected_z7[] =
6707 // pg: 0 1 1
6708 {0xe9eaebecedeeeff0, 0x401c000000000000, 0x401c000000000000};
6709 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
6710
6711 // 8.0 as FP64: 0x4020000000000000
6712 uint64_t expected_z8[] =
6713 // pg: 0 1 1
6714 {0xe9eaebecedeeeff0, 0x4020000000000000, 0x4020000000000000};
6715 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
6716
6717 // -9.0 as FP64: 0xc022000000000000
6718 uint64_t expected_z9[] =
6719 // pg: 0 1 1
6720 {0xe9eaebecedeeeff0, 0xc022000000000000, 0xc022000000000000};
6721 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
6722
6723 // 0.0 as FP32: 0x00000000
6724 uint64_t expected_z10[] =
6725 // pg: 0 0 1 1 0 1
6726 {0xe9eaebecedeeeff0, 0x0000000000000000, 0xf9fafbfc00000000};
6727 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
6728
6729 // 42.0 as FP16: 0x5140
6730 uint64_t expected_z11[] =
6731 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6732 {0xe9eaebec5140eff0, 0xf1f25140f5f65140, 0x5140fbfcfdfe5140};
6733 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
6734
6735 // Signalling NaN (with payload): 0x7ff0000012340000
6736 uint64_t expected_z12[] =
6737 // pg: 0 1 1
6738 {0xe9eaebecedeeeff0, 0x7ff0000012340000, 0x7ff0000012340000};
6739 ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
6740
6741 // -infinity as FP16: 0xfc00
6742 uint64_t expected_z13[] =
6743 // pg: 0 0 1 0 0 1 0 1 1 0 0 1
6744 {0xe9eaebecfc00eff0, 0xf1f2fc00f5f6fc00, 0xfc00fbfcfdfefc00};
6745 ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
6746
Martyn Capewell7db82102020-06-02 16:40:09 +01006747 ASSERT_EQUAL_SVE(z10.VnD(), z14.VnD());
6748 ASSERT_EQUAL_SVE(z11.VnD(), z15.VnD());
6749 ASSERT_EQUAL_SVE(z12.VnD(), z16.VnD());
6750 ASSERT_EQUAL_SVE(z13.VnD(), z17.VnD());
Jacob Bramley0f62eab2019-10-23 17:07:47 +01006751 // clang-format on
6752 }
6753}
6754
TatWai Chong4f28df72019-08-14 17:50:30 -07006755TEST_SVE(sve_permute_vector_unpredicated_table_lookup) {
6756 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6757 START();
6758
6759 uint64_t table_inputs[] = {0xffeeddccbbaa9988, 0x7766554433221100};
6760
6761 int index_b[] = {255, 255, 11, 10, 15, 14, 13, 12, 1, 0, 4, 3, 7, 6, 5, 4};
6762
6763 int index_h[] = {5, 6, 7, 8, 2, 3, 6, 4};
6764
6765 int index_s[] = {1, 3, 2, 31, -1};
6766
6767 int index_d[] = {31, 1};
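  // `Tbl` writes zero for any index greater than or equal to the number of
  // lanes, so whether entries such as 255, 31 or -1 (0xffffffff for S lanes)
  // select real data depends on the vector length; the checks below compare
  // each index against vl to account for this.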
6768
6769  // Initialise the register with a value that does not appear in the table inputs.
6770 __ Dup(z9.VnB(), 0x1f);
6771 InsrHelper(&masm, z9.VnD(), table_inputs);
6772
6773 ZRegister ind_b = z0.WithLaneSize(kBRegSize);
6774 ZRegister ind_h = z1.WithLaneSize(kHRegSize);
6775 ZRegister ind_s = z2.WithLaneSize(kSRegSize);
6776 ZRegister ind_d = z3.WithLaneSize(kDRegSize);
6777
6778 InsrHelper(&masm, ind_b, index_b);
6779 InsrHelper(&masm, ind_h, index_h);
6780 InsrHelper(&masm, ind_s, index_s);
6781 InsrHelper(&masm, ind_d, index_d);
6782
6783 __ Tbl(z26.VnB(), z9.VnB(), ind_b);
6784
6785 __ Tbl(z27.VnH(), z9.VnH(), ind_h);
6786
6787 __ Tbl(z28.VnS(), z9.VnS(), ind_s);
6788
6789 __ Tbl(z29.VnD(), z9.VnD(), ind_d);
6790
6791 END();
6792
6793 if (CAN_RUN()) {
6794 RUN();
6795
6796 // clang-format off
6797 unsigned z26_expected[] = {0x1f, 0x1f, 0xbb, 0xaa, 0xff, 0xee, 0xdd, 0xcc,
6798 0x11, 0x00, 0x44, 0x33, 0x77, 0x66, 0x55, 0x44};
6799
6800 unsigned z27_expected[] = {0xbbaa, 0xddcc, 0xffee, 0x1f1f,
6801 0x5544, 0x7766, 0xddcc, 0x9988};
6802
6803 unsigned z28_expected[] =
6804 {0x77665544, 0xffeeddcc, 0xbbaa9988, 0x1f1f1f1f, 0x1f1f1f1f};
6805
6806 uint64_t z29_expected[] = {0x1f1f1f1f1f1f1f1f, 0xffeeddccbbaa9988};
6807 // clang-format on
6808
6809 unsigned vl = config->sve_vl_in_bits();
6810 for (size_t i = 0; i < ArrayLength(index_b); i++) {
6811 int lane = static_cast<int>(ArrayLength(index_b) - i - 1);
6812 if (!core.HasSVELane(z26.VnB(), lane)) break;
6813 uint64_t expected = (vl > (index_b[i] * kBRegSize)) ? z26_expected[i] : 0;
6814 ASSERT_EQUAL_SVE_LANE(expected, z26.VnB(), lane);
6815 }
6816
6817 for (size_t i = 0; i < ArrayLength(index_h); i++) {
6818 int lane = static_cast<int>(ArrayLength(index_h) - i - 1);
6819 if (!core.HasSVELane(z27.VnH(), lane)) break;
6820 uint64_t expected = (vl > (index_h[i] * kHRegSize)) ? z27_expected[i] : 0;
6821 ASSERT_EQUAL_SVE_LANE(expected, z27.VnH(), lane);
6822 }
6823
6824 for (size_t i = 0; i < ArrayLength(index_s); i++) {
6825 int lane = static_cast<int>(ArrayLength(index_s) - i - 1);
6826 if (!core.HasSVELane(z28.VnS(), lane)) break;
6827 uint64_t expected = (vl > (index_s[i] * kSRegSize)) ? z28_expected[i] : 0;
6828 ASSERT_EQUAL_SVE_LANE(expected, z28.VnS(), lane);
6829 }
6830
6831 for (size_t i = 0; i < ArrayLength(index_d); i++) {
6832 int lane = static_cast<int>(ArrayLength(index_d) - i - 1);
6833 if (!core.HasSVELane(z29.VnD(), lane)) break;
6834 uint64_t expected = (vl > (index_d[i] * kDRegSize)) ? z29_expected[i] : 0;
6835 ASSERT_EQUAL_SVE_LANE(expected, z29.VnD(), lane);
6836 }
6837 }
6838}
6839
Jacob Bramley199339d2019-08-05 18:49:13 +01006840TEST_SVE(ldr_str_z_bi) {
6841 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6842 START();
6843
6844 int vl = config->sve_vl_in_bytes();
6845
6846 // The immediate can address [-256, 255] times the VL, so allocate enough
6847 // space to exceed that in both directions.
6848 int data_size = vl * 1024;
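  // With the base placed half-way through the buffer, vl * 1024 leaves room
  // for about +/-512 VLs on either side, comfortably covering the +/-314 VL
  // offsets used below.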
6849
6850 uint8_t* data = new uint8_t[data_size];
6851 memset(data, 0, data_size);
6852
6853 // Set the base half-way through the buffer so we can use negative indices.
6854 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
6855
6856 __ Index(z1.VnB(), 1, 3);
6857 __ Index(z2.VnB(), 2, 5);
6858 __ Index(z3.VnB(), 3, 7);
6859 __ Index(z4.VnB(), 4, 11);
6860 __ Index(z5.VnB(), 5, 13);
6861 __ Index(z6.VnB(), 6, 2);
6862 __ Index(z7.VnB(), 7, 3);
6863 __ Index(z8.VnB(), 8, 5);
6864 __ Index(z9.VnB(), 9, 7);
6865
6866 // Encodable cases.
6867 __ Str(z1, SVEMemOperand(x0));
6868 __ Str(z2, SVEMemOperand(x0, 2, SVE_MUL_VL));
6869 __ Str(z3, SVEMemOperand(x0, -3, SVE_MUL_VL));
6870 __ Str(z4, SVEMemOperand(x0, 255, SVE_MUL_VL));
6871 __ Str(z5, SVEMemOperand(x0, -256, SVE_MUL_VL));
6872
Jacob Bramley6ebbba62019-10-09 15:02:10 +01006873 // Cases that fall back on `CalculateSVEAddress`.
Jacob Bramley199339d2019-08-05 18:49:13 +01006874 __ Str(z6, SVEMemOperand(x0, 6 * vl));
6875 __ Str(z7, SVEMemOperand(x0, -7 * vl));
6876 __ Str(z8, SVEMemOperand(x0, 314, SVE_MUL_VL));
6877 __ Str(z9, SVEMemOperand(x0, -314, SVE_MUL_VL));
6878
6879 // Corresponding loads.
6880 __ Ldr(z11, SVEMemOperand(x0, xzr)); // Test xzr operand.
6881 __ Ldr(z12, SVEMemOperand(x0, 2, SVE_MUL_VL));
6882 __ Ldr(z13, SVEMemOperand(x0, -3, SVE_MUL_VL));
6883 __ Ldr(z14, SVEMemOperand(x0, 255, SVE_MUL_VL));
6884 __ Ldr(z15, SVEMemOperand(x0, -256, SVE_MUL_VL));
6885
6886 __ Ldr(z16, SVEMemOperand(x0, 6 * vl));
6887 __ Ldr(z17, SVEMemOperand(x0, -7 * vl));
6888 __ Ldr(z18, SVEMemOperand(x0, 314, SVE_MUL_VL));
6889 __ Ldr(z19, SVEMemOperand(x0, -314, SVE_MUL_VL));
6890
6891 END();
6892
6893 if (CAN_RUN()) {
6894 RUN();
6895
6896 uint8_t* expected = new uint8_t[data_size];
6897 memset(expected, 0, data_size);
6898 uint8_t* middle = &expected[data_size / 2];
6899
6900 for (int i = 0; i < vl; i++) {
6901 middle[i] = (1 + (3 * i)) & 0xff; // z1
6902 middle[(2 * vl) + i] = (2 + (5 * i)) & 0xff; // z2
6903 middle[(-3 * vl) + i] = (3 + (7 * i)) & 0xff; // z3
6904 middle[(255 * vl) + i] = (4 + (11 * i)) & 0xff; // z4
6905 middle[(-256 * vl) + i] = (5 + (13 * i)) & 0xff; // z5
6906 middle[(6 * vl) + i] = (6 + (2 * i)) & 0xff; // z6
6907 middle[(-7 * vl) + i] = (7 + (3 * i)) & 0xff; // z7
6908 middle[(314 * vl) + i] = (8 + (5 * i)) & 0xff; // z8
6909 middle[(-314 * vl) + i] = (9 + (7 * i)) & 0xff; // z9
6910 }
6911
Jacob Bramley33c99f92019-10-08 15:24:12 +01006912 ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
Jacob Bramley199339d2019-08-05 18:49:13 +01006913
6914 ASSERT_EQUAL_SVE(z1, z11);
6915 ASSERT_EQUAL_SVE(z2, z12);
6916 ASSERT_EQUAL_SVE(z3, z13);
6917 ASSERT_EQUAL_SVE(z4, z14);
6918 ASSERT_EQUAL_SVE(z5, z15);
6919 ASSERT_EQUAL_SVE(z6, z16);
6920 ASSERT_EQUAL_SVE(z7, z17);
6921 ASSERT_EQUAL_SVE(z8, z18);
6922 ASSERT_EQUAL_SVE(z9, z19);
6923
6924 delete[] expected;
6925 }
6926 delete[] data;
6927}
6928
6929TEST_SVE(ldr_str_p_bi) {
6930 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
6931 START();
6932
6933 int vl = config->sve_vl_in_bytes();
6934 VIXL_ASSERT((vl % kZRegBitsPerPRegBit) == 0);
6935 int pl = vl / kZRegBitsPerPRegBit;
6936
6937 // The immediate can address [-256, 255] times the PL, so allocate enough
6938 // space to exceed that in both directions.
6939 int data_size = pl * 1024;
6940
6941 uint8_t* data = new uint8_t[data_size];
6942 memset(data, 0, data_size);
6943
6944 // Set the base half-way through the buffer so we can use negative indices.
6945 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
6946
6947 uint64_t pattern[4] = {0x1010101011101111,
6948 0x0010111011000101,
6949 0x1001101110010110,
6950 0x1010110101100011};
6951 for (int i = 8; i <= 15; i++) {
6952 // Initialise p8-p15 with a conveniently-recognisable, non-zero pattern.
6953 Initialise(&masm,
6954 PRegister(i),
6955 pattern[3] * i,
6956 pattern[2] * i,
6957 pattern[1] * i,
6958 pattern[0] * i);
6959 }
6960
6961 // Encodable cases.
6962 __ Str(p8, SVEMemOperand(x0));
6963 __ Str(p9, SVEMemOperand(x0, 2, SVE_MUL_VL));
6964 __ Str(p10, SVEMemOperand(x0, -3, SVE_MUL_VL));
6965 __ Str(p11, SVEMemOperand(x0, 255, SVE_MUL_VL));
6966
Jacob Bramley6ebbba62019-10-09 15:02:10 +01006967 // Cases that fall back on `CalculateSVEAddress`.
Jacob Bramley199339d2019-08-05 18:49:13 +01006968 __ Str(p12, SVEMemOperand(x0, 6 * pl));
6969 __ Str(p13, SVEMemOperand(x0, -7 * pl));
6970 __ Str(p14, SVEMemOperand(x0, 314, SVE_MUL_VL));
6971 __ Str(p15, SVEMemOperand(x0, -314, SVE_MUL_VL));
6972
6973 // Corresponding loads.
6974 __ Ldr(p0, SVEMemOperand(x0));
6975 __ Ldr(p1, SVEMemOperand(x0, 2, SVE_MUL_VL));
6976 __ Ldr(p2, SVEMemOperand(x0, -3, SVE_MUL_VL));
6977 __ Ldr(p3, SVEMemOperand(x0, 255, SVE_MUL_VL));
6978
6979 __ Ldr(p4, SVEMemOperand(x0, 6 * pl));
6980 __ Ldr(p5, SVEMemOperand(x0, -7 * pl));
6981 __ Ldr(p6, SVEMemOperand(x0, 314, SVE_MUL_VL));
6982 __ Ldr(p7, SVEMemOperand(x0, -314, SVE_MUL_VL));
6983
6984 END();
6985
6986 if (CAN_RUN()) {
6987 RUN();
6988
6989 uint8_t* expected = new uint8_t[data_size];
6990 memset(expected, 0, data_size);
6991 uint8_t* middle = &expected[data_size / 2];
6992
6993 for (int i = 0; i < pl; i++) {
6994 int bit_index = (i % sizeof(pattern[0])) * kBitsPerByte;
6995 size_t index = i / sizeof(pattern[0]);
6996 VIXL_ASSERT(index < ArrayLength(pattern));
6997 uint64_t byte = (pattern[index] >> bit_index) & 0xff;
6998 // Each byte of `pattern` can be multiplied by 15 without carry.
6999 VIXL_ASSERT((byte * 15) <= 0xff);
7000
7001 middle[i] = byte * 8; // p8
7002 middle[(2 * pl) + i] = byte * 9; // p9
7003 middle[(-3 * pl) + i] = byte * 10; // p10
7004 middle[(255 * pl) + i] = byte * 11; // p11
7005 middle[(6 * pl) + i] = byte * 12; // p12
7006 middle[(-7 * pl) + i] = byte * 13; // p13
7007 middle[(314 * pl) + i] = byte * 14; // p14
7008 middle[(-314 * pl) + i] = byte * 15; // p15
7009 }
7010
Jacob Bramley33c99f92019-10-08 15:24:12 +01007011 ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
Jacob Bramley199339d2019-08-05 18:49:13 +01007012
7013 ASSERT_EQUAL_SVE(p0, p8);
7014 ASSERT_EQUAL_SVE(p1, p9);
7015 ASSERT_EQUAL_SVE(p2, p10);
7016 ASSERT_EQUAL_SVE(p3, p11);
7017 ASSERT_EQUAL_SVE(p4, p12);
7018 ASSERT_EQUAL_SVE(p5, p13);
7019 ASSERT_EQUAL_SVE(p6, p14);
7020 ASSERT_EQUAL_SVE(p7, p15);
7021
7022 delete[] expected;
7023 }
7024 delete[] data;
7025}
7026
Jacob Bramleye668b202019-08-14 17:57:34 +01007027template <typename T>
7028static void MemoryWrite(uint8_t* base, int64_t offset, int64_t index, T data) {
7029 memcpy(base + offset + (index * sizeof(data)), &data, sizeof(data));
7030}
7031
7032TEST_SVE(sve_ld1_st1_contiguous) {
7033 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
7034 START();
7035
7036 int vl = config->sve_vl_in_bytes();
7037
7038 // The immediate can address [-8, 7] times the VL, so allocate enough space to
7039 // exceed that in both directions.
7040 int data_size = vl * 128;
7041
7042 uint8_t* data = new uint8_t[data_size];
7043 memset(data, 0, data_size);
7044
Martyn Capewell452ad8b2020-03-19 15:49:57 +00007045 // Set the base half-way through the buffer so we can use negative indices.
Jacob Bramleye668b202019-08-14 17:57:34 +01007046 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
7047
Jacob Bramleye668b202019-08-14 17:57:34 +01007048 // Encodable scalar-plus-immediate cases.
7049 __ Index(z1.VnB(), 1, -3);
7050 __ Ptrue(p1.VnB());
7051 __ St1b(z1.VnB(), p1, SVEMemOperand(x0));
7052
7053 __ Index(z2.VnH(), -2, 5);
7054 __ Ptrue(p2.VnH(), SVE_MUL3);
7055 __ St1b(z2.VnH(), p2, SVEMemOperand(x0, 7, SVE_MUL_VL));
7056
7057 __ Index(z3.VnS(), 3, -7);
7058 __ Ptrue(p3.VnS(), SVE_POW2);
7059 __ St1h(z3.VnS(), p3, SVEMemOperand(x0, -8, SVE_MUL_VL));
7060
7061 // Encodable scalar-plus-scalar cases.
7062 __ Index(z4.VnD(), -4, 11);
7063 __ Ptrue(p4.VnD(), SVE_VL3);
7064 __ Addvl(x1, x0, 8); // Try not to overlap with VL-dependent cases.
7065 __ Mov(x2, 17);
7066 __ St1b(z4.VnD(), p4, SVEMemOperand(x1, x2));
7067
7068 __ Index(z5.VnD(), 6, -2);
7069 __ Ptrue(p5.VnD(), SVE_VL16);
TatWai Chong6205eb42019-09-24 10:07:20 +01007070 __ Addvl(x3, x0, 10); // Try not to overlap with VL-dependent cases.
7071 __ Mov(x4, 6);
7072 __ St1d(z5.VnD(), p5, SVEMemOperand(x3, x4, LSL, 3));
Jacob Bramleye668b202019-08-14 17:57:34 +01007073
Jacob Bramley6ebbba62019-10-09 15:02:10 +01007074 // Unencodable cases fall back on `CalculateSVEAddress`.
Jacob Bramleye668b202019-08-14 17:57:34 +01007075 __ Index(z6.VnS(), -7, 3);
7076 // Setting SVE_ALL on B lanes checks that the Simulator ignores irrelevant
7077 // predicate bits when handling larger lanes.
7078 __ Ptrue(p6.VnB(), SVE_ALL);
7079 __ St1w(z6.VnS(), p6, SVEMemOperand(x0, 42, SVE_MUL_VL));
7080
TatWai Chong6205eb42019-09-24 10:07:20 +01007081 __ Index(z7.VnD(), 32, -11);
7082 __ Ptrue(p7.VnD(), SVE_MUL4);
7083 __ St1w(z7.VnD(), p7, SVEMemOperand(x0, 22, SVE_MUL_VL));
Jacob Bramleye668b202019-08-14 17:57:34 +01007084
TatWai Chong6205eb42019-09-24 10:07:20 +01007085 // Corresponding loads.
7086 __ Ld1b(z8.VnB(), p1.Zeroing(), SVEMemOperand(x0));
7087 __ Ld1b(z9.VnH(), p2.Zeroing(), SVEMemOperand(x0, 7, SVE_MUL_VL));
7088 __ Ld1h(z10.VnS(), p3.Zeroing(), SVEMemOperand(x0, -8, SVE_MUL_VL));
7089 __ Ld1b(z11.VnD(), p4.Zeroing(), SVEMemOperand(x1, x2));
7090 __ Ld1d(z12.VnD(), p5.Zeroing(), SVEMemOperand(x3, x4, LSL, 3));
7091 __ Ld1w(z13.VnS(), p6.Zeroing(), SVEMemOperand(x0, 42, SVE_MUL_VL));
7092
7093 __ Ld1sb(z14.VnH(), p2.Zeroing(), SVEMemOperand(x0, 7, SVE_MUL_VL));
7094 __ Ld1sh(z15.VnS(), p3.Zeroing(), SVEMemOperand(x0, -8, SVE_MUL_VL));
7095 __ Ld1sb(z16.VnD(), p4.Zeroing(), SVEMemOperand(x1, x2));
7096 __ Ld1sw(z17.VnD(), p7.Zeroing(), SVEMemOperand(x0, 22, SVE_MUL_VL));
7097
7098 // We can test ld1 by comparing the value loaded with the value stored. In
7099 // most cases, there are two complications:
7100 // - Loads have zeroing predication, so we have to clear the inactive
7101 // elements on our reference.
7102 // - We have to replicate any sign- or zero-extension.
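  //
  // For example, a byte 0xf8 written by st1b and read back by ld1sb into an H
  // lane becomes 0xfff8, whereas ld1b produces 0x00f8; the references below
  // replicate this with Sxtb/Uxtb and friends on the stored values.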
7103
7104 // Ld1b(z8.VnB(), ...)
7105 __ Dup(z18.VnB(), 0);
7106 __ Mov(z18.VnB(), p1.Merging(), z1.VnB());
7107
7108 // Ld1b(z9.VnH(), ...)
7109 __ Dup(z19.VnH(), 0);
7110 __ Uxtb(z19.VnH(), p2.Merging(), z2.VnH());
7111
7112 // Ld1h(z10.VnS(), ...)
7113 __ Dup(z20.VnS(), 0);
7114 __ Uxth(z20.VnS(), p3.Merging(), z3.VnS());
7115
7116 // Ld1b(z11.VnD(), ...)
7117 __ Dup(z21.VnD(), 0);
7118 __ Uxtb(z21.VnD(), p4.Merging(), z4.VnD());
7119
7120 // Ld1d(z12.VnD(), ...)
7121 __ Dup(z22.VnD(), 0);
7122 __ Mov(z22.VnD(), p5.Merging(), z5.VnD());
7123
7124 // Ld1w(z13.VnS(), ...)
7125 __ Dup(z23.VnS(), 0);
7126 __ Mov(z23.VnS(), p6.Merging(), z6.VnS());
7127
7128 // Ld1sb(z14.VnH(), ...)
7129 __ Dup(z24.VnH(), 0);
7130 __ Sxtb(z24.VnH(), p2.Merging(), z2.VnH());
7131
7132 // Ld1sh(z15.VnS(), ...)
7133 __ Dup(z25.VnS(), 0);
7134 __ Sxth(z25.VnS(), p3.Merging(), z3.VnS());
7135
7136 // Ld1sb(z16.VnD(), ...)
7137 __ Dup(z26.VnD(), 0);
7138 __ Sxtb(z26.VnD(), p4.Merging(), z4.VnD());
7139
7140 // Ld1sw(z17.VnD(), ...)
7141 __ Dup(z27.VnD(), 0);
7142 __ Sxtw(z27.VnD(), p7.Merging(), z7.VnD());
Jacob Bramleye668b202019-08-14 17:57:34 +01007143
7144 END();
7145
7146 if (CAN_RUN()) {
7147 RUN();
7148
7149 uint8_t* expected = new uint8_t[data_size];
7150 memset(expected, 0, data_size);
7151 uint8_t* middle = &expected[data_size / 2];
7152
7153 int vl_b = vl / kBRegSizeInBytes;
7154 int vl_h = vl / kHRegSizeInBytes;
7155 int vl_s = vl / kSRegSizeInBytes;
7156 int vl_d = vl / kDRegSizeInBytes;
7157
7158 // Encodable cases.
7159
7160 // st1b { z1.b }, SVE_ALL
7161 for (int i = 0; i < vl_b; i++) {
7162 MemoryWrite(middle, 0, i, static_cast<uint8_t>(1 - (3 * i)));
7163 }
7164
7165 // st1b { z2.h }, SVE_MUL3
7166 int vl_h_mul3 = vl_h - (vl_h % 3);
7167 for (int i = 0; i < vl_h_mul3; i++) {
Jacob Bramley6ebbba62019-10-09 15:02:10 +01007168 int64_t offset = 7 * static_cast<int>(vl / (kHRegSize / kBRegSize));
7169 MemoryWrite(middle, offset, i, static_cast<uint8_t>(-2 + (5 * i)));
Jacob Bramleye668b202019-08-14 17:57:34 +01007170 }
7171
7172 // st1h { z3.s }, SVE_POW2
7173 int vl_s_pow2 = 1 << HighestSetBitPosition(vl_s);
7174 for (int i = 0; i < vl_s_pow2; i++) {
Jacob Bramley6ebbba62019-10-09 15:02:10 +01007175 int64_t offset = -8 * static_cast<int>(vl / (kSRegSize / kHRegSize));
7176 MemoryWrite(middle, offset, i, static_cast<uint16_t>(3 - (7 * i)));
Jacob Bramleye668b202019-08-14 17:57:34 +01007177 }
7178
7179 // st1b { z4.d }, SVE_VL3
7180 if (vl_d >= 3) {
7181 for (int i = 0; i < 3; i++) {
7182 MemoryWrite(middle,
7183 (8 * vl) + 17,
7184 i,
7185 static_cast<uint8_t>(-4 + (11 * i)));
7186 }
7187 }
7188
7189 // st1d { z5.d }, SVE_VL16
7190 if (vl_d >= 16) {
7191 for (int i = 0; i < 16; i++) {
7192 MemoryWrite(middle,
7193 (10 * vl) + (6 * kDRegSizeInBytes),
7194 i,
7195 static_cast<uint64_t>(6 - (2 * i)));
7196 }
7197 }
7198
7199 // Unencodable cases.
7200
7201 // st1w { z6.s }, SVE_ALL
7202 for (int i = 0; i < vl_s; i++) {
7203 MemoryWrite(middle, 42 * vl, i, static_cast<uint32_t>(-7 + (3 * i)));
7204 }
7205
TatWai Chong6205eb42019-09-24 10:07:20 +01007206 // st1w { z7.d }, SVE_MUL4
7207 int vl_d_mul4 = vl_d - (vl_d % 4);
7208 for (int i = 0; i < vl_d_mul4; i++) {
Jacob Bramley6ebbba62019-10-09 15:02:10 +01007209 int64_t offset = 22 * static_cast<int>(vl / (kDRegSize / kWRegSize));
7210 MemoryWrite(middle, offset, i, static_cast<uint32_t>(32 + (-11 * i)));
TatWai Chong6205eb42019-09-24 10:07:20 +01007211 }
7212
Jacob Bramley33c99f92019-10-08 15:24:12 +01007213 ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
Jacob Bramleye668b202019-08-14 17:57:34 +01007214
TatWai Chong6205eb42019-09-24 10:07:20 +01007215 // Check that we loaded back the expected values.
7216
7217 ASSERT_EQUAL_SVE(z18, z8);
7218 ASSERT_EQUAL_SVE(z19, z9);
7219 ASSERT_EQUAL_SVE(z20, z10);
7220 ASSERT_EQUAL_SVE(z21, z11);
7221 ASSERT_EQUAL_SVE(z22, z12);
7222 ASSERT_EQUAL_SVE(z23, z13);
7223 ASSERT_EQUAL_SVE(z24, z14);
7224 ASSERT_EQUAL_SVE(z25, z15);
7225 ASSERT_EQUAL_SVE(z26, z16);
7226 ASSERT_EQUAL_SVE(z27, z17);
7227
Jacob Bramleye668b202019-08-14 17:57:34 +01007228 delete[] expected;
7229 }
7230 delete[] data;
7231}
7232
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007233TEST_SVE(sve_ld2_st2_scalar_plus_imm) {
7234 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
7235 START();
7236
7237 int vl = config->sve_vl_in_bytes();
7238
7239 // The immediate can address [-16, 14] times the VL, so allocate enough space
7240 // to exceed that in both directions.
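  // For example (illustrative numbers only): with a 256-bit vector length
  // (vl == 32 bytes), the st2/ld2 immediate form reaches byte offsets in
  // [-16 * 32, 14 * 32] from the base, and each two-register transfer covers
  // a further 2 * 32 bytes, so a buffer of 32 * 128 = 4096 bytes centred on
  // the base comfortably contains every access made below.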
7241 int data_size = vl * 128;
7242
7243 uint8_t* data = new uint8_t[data_size];
7244 memset(data, 0, data_size);
7245
Josh Sorefb43d6ef2022-08-03 12:47:14 -04007246 // Set the base half-way through the buffer so we can use negative indices.
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007247 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
7248
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007249 __ Index(z14.VnB(), 1, -3);
7250 __ Index(z15.VnB(), 2, -3);
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007251 __ Ptrue(p0.VnB());
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007252 __ St2b(z14.VnB(), z15.VnB(), p0, SVEMemOperand(x0));
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007253
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007254 __ Index(z16.VnH(), -2, 5);
7255 __ Index(z17.VnH(), -3, 5);
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007256 __ Ptrue(p1.VnH(), SVE_MUL3);
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007257 __ St2h(z16.VnH(), z17.VnH(), p1, SVEMemOperand(x0, 8, SVE_MUL_VL));
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007258
7259 // Wrap around from z31 to z0.
7260 __ Index(z31.VnS(), 3, -7);
7261 __ Index(z0.VnS(), 4, -7);
7262 __ Ptrue(p2.VnS(), SVE_POW2);
7263 __ St2w(z31.VnS(), z0.VnS(), p2, SVEMemOperand(x0, -12, SVE_MUL_VL));
7264
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007265 __ Index(z18.VnD(), -7, 3);
7266 __ Index(z19.VnD(), -8, 3);
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007267 // Sparse predication, including some irrelevant bits (0xe). To make the
7268 // results easy to check, activate each lane <n> where n is a multiple of 5.
7269 Initialise(&masm,
7270 p3,
7271 0xeee10000000001ee,
7272 0xeeeeeee100000000,
7273 0x01eeeeeeeee10000,
7274 0x000001eeeeeeeee1);
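  // A predicate register has one bit per vector byte, and for .D lanes only
  // the lowest bit of each eight-bit group is significant; the 0xe nibbles are
  // the "irrelevant bits" mentioned above. Reading value0 from its least
  // significant byte upwards, 0x000001eeeeeeeee1 activates lanes 0 and 5,
  // value1 continues with lanes 10 and 15, and so on.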
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007275 __ St2d(z18.VnD(), z19.VnD(), p3, SVEMemOperand(x0, 14, SVE_MUL_VL));
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007276
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007277 // We can test ld2 by comparing the values loaded with the values stored.
7278 // There are two complications:
7279 // - Loads have zeroing predication, so we have to clear the inactive
7280 // elements on our reference.
7281 // - We want to test both loads and stores that span { z31, z0 }, so we have
7282 // to move some values around.
7283 //
7284 // Registers z4-z11 will hold as-stored values (with inactive elements
7285 // cleared). Registers z20-z27 will hold the values that were loaded.
7286
7287 // Ld2b(z14.VnB(), z15.VnB(), ...)
7288 __ Dup(z4.VnB(), 0);
7289 __ Dup(z5.VnB(), 0);
7290 __ Mov(z4.VnB(), p0.Merging(), z14.VnB());
7291 __ Mov(z5.VnB(), p0.Merging(), z15.VnB());
7292
7293 // Ld2h(z16.VnH(), z17.VnH(), ...)
7294 __ Dup(z6.VnH(), 0);
7295 __ Dup(z7.VnH(), 0);
7296 __ Mov(z6.VnH(), p1.Merging(), z16.VnH());
7297 __ Mov(z7.VnH(), p1.Merging(), z17.VnH());
7298
7299 // Ld2w(z31.VnS(), z0.VnS(), ...)
7300 __ Dup(z8.VnS(), 0);
7301 __ Dup(z9.VnS(), 0);
7302 __ Mov(z8.VnS(), p2.Merging(), z31.VnS());
7303 __ Mov(z9.VnS(), p2.Merging(), z0.VnS());
7304
7305 // Ld2d(z18.VnD(), z19.VnD(), ...)
7306 __ Dup(z10.VnD(), 0);
7307 __ Dup(z11.VnD(), 0);
7308 __ Mov(z10.VnD(), p3.Merging(), z18.VnD());
7309 __ Mov(z11.VnD(), p3.Merging(), z19.VnD());
7310
7311 // Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
7312 __ Ld2b(z31.VnB(), z0.VnB(), p0.Zeroing(), SVEMemOperand(x0));
7313 __ Mov(z20, z31);
7314 __ Mov(z21, z0);
7315
7316 __ Ld2h(z22.VnH(), z23.VnH(), p1.Zeroing(), SVEMemOperand(x0, 8, SVE_MUL_VL));
7317 __ Ld2w(z24.VnS(),
7318 z25.VnS(),
7319 p2.Zeroing(),
7320 SVEMemOperand(x0, -12, SVE_MUL_VL));
7321 __ Ld2d(z26.VnD(),
7322 z27.VnD(),
7323 p3.Zeroing(),
7324 SVEMemOperand(x0, 14, SVE_MUL_VL));
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007325
7326 END();
7327
7328 if (CAN_RUN()) {
7329 RUN();
7330
7331 uint8_t* expected = new uint8_t[data_size];
7332 memset(expected, 0, data_size);
7333 uint8_t* middle = &expected[data_size / 2];
7334
7335 int vl_b = vl / kBRegSizeInBytes;
7336 int vl_h = vl / kHRegSizeInBytes;
7337 int vl_s = vl / kSRegSizeInBytes;
7338 int vl_d = vl / kDRegSizeInBytes;
7339
7340 int reg_count = 2;
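    // St2/Ld2 interleave their registers element-wise in memory: element i of
    // the first register occupies element slot (i * 2) and element i of the
    // second occupies (i * 2) + 1. The (i * reg_count) + n indexing below
    // models exactly that layout; the st3/st4 tests later use the same scheme
    // with reg_count of 3 and 4.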
7341
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007342 // st2b { z14.b, z15.b }, SVE_ALL
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007343 for (int i = 0; i < vl_b; i++) {
7344 uint8_t lane0 = 1 - (3 * i);
7345 uint8_t lane1 = 2 - (3 * i);
7346 MemoryWrite(middle, 0, (i * reg_count) + 0, lane0);
7347 MemoryWrite(middle, 0, (i * reg_count) + 1, lane1);
7348 }
7349
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007350 // st2h { z16.h, z17.h }, SVE_MUL3
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007351 int vl_h_mul3 = vl_h - (vl_h % 3);
7352 for (int i = 0; i < vl_h_mul3; i++) {
7353 int64_t offset = 8 * vl;
7354 uint16_t lane0 = -2 + (5 * i);
7355 uint16_t lane1 = -3 + (5 * i);
7356 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7357 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7358 }
7359
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007360 // st2w { z31.s, z0.s }, SVE_POW2
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007361 int vl_s_pow2 = 1 << HighestSetBitPosition(vl_s);
7362 for (int i = 0; i < vl_s_pow2; i++) {
7363 int64_t offset = -12 * vl;
7364 uint32_t lane0 = 3 - (7 * i);
7365 uint32_t lane1 = 4 - (7 * i);
7366 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7367 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7368 }
7369
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007370 // st2d { z18.d, z19.d }, ((i % 5) == 0)
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007371 for (int i = 0; i < vl_d; i++) {
7372 if ((i % 5) == 0) {
7373 int64_t offset = 14 * vl;
7374 uint64_t lane0 = -7 + (3 * i);
7375 uint64_t lane1 = -8 + (3 * i);
7376 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7377 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7378 }
7379 }
7380
7381 ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
7382
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007383 // Check that we loaded back the expected values.
7384
7385 // st2b/ld2b
7386 ASSERT_EQUAL_SVE(z4, z20);
7387 ASSERT_EQUAL_SVE(z5, z21);
7388
7389 // st2h/ld2h
7390 ASSERT_EQUAL_SVE(z6, z22);
7391 ASSERT_EQUAL_SVE(z7, z23);
7392
7393 // st2w/ld2w
7394 ASSERT_EQUAL_SVE(z8, z24);
7395 ASSERT_EQUAL_SVE(z9, z25);
7396
7397 // st2d/ld2d
7398 ASSERT_EQUAL_SVE(z10, z26);
7399 ASSERT_EQUAL_SVE(z11, z27);
7400
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007401 delete[] expected;
7402 }
7403 delete[] data;
7404}
7405
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007406TEST_SVE(sve_ld2_st2_scalar_plus_scalar) {
7407 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
7408 START();
7409
7410 int vl = config->sve_vl_in_bytes();
7411
7412 // Allocate plenty of space to enable indexing in both directions.
7413 int data_size = vl * 128;
7414
7415 uint8_t* data = new uint8_t[data_size];
7416 memset(data, 0, data_size);
7417
Josh Sorefb43d6ef2022-08-03 12:47:14 -04007418 // Set the base half-way through the buffer so we can use negative indices.
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007419 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
7420
Jacob Bramleye483ce52019-11-05 16:52:29 +00007421 __ Index(z10.VnB(), -4, 11);
7422 __ Index(z11.VnB(), -5, 11);
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007423 __ Ptrue(p7.VnB(), SVE_MUL4);
7424 __ Mov(x1, 0);
Jacob Bramleye483ce52019-11-05 16:52:29 +00007425 __ St2b(z10.VnB(), z11.VnB(), p7, SVEMemOperand(x0, x1));
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007426
Jacob Bramleye483ce52019-11-05 16:52:29 +00007427 __ Index(z12.VnH(), 6, -2);
7428 __ Index(z13.VnH(), 7, -2);
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007429 __ Ptrue(p6.VnH(), SVE_VL16);
7430 __ Rdvl(x2, 3); // Make offsets VL-dependent so we can avoid overlap.
Jacob Bramleye483ce52019-11-05 16:52:29 +00007431 __ St2h(z12.VnH(), z13.VnH(), p6, SVEMemOperand(x0, x2, LSL, 1));
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007432
Jacob Bramleye483ce52019-11-05 16:52:29 +00007433 __ Index(z14.VnS(), -7, 3);
7434 __ Index(z15.VnS(), -8, 3);
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007435 // Sparse predication, including some irrelevant bits (0xe). To make the
7436 // results easy to check, activate each lane <n> where n is a multiple of 5.
7437 Initialise(&masm,
7438 p5,
7439 0xeee1000010000100,
7440 0x001eeee100001000,
7441 0x0100001eeee10000,
7442 0x10000100001eeee1);
Jacob Bramleye483ce52019-11-05 16:52:29 +00007443 __ Rdvl(x3, -3); // -(3 * vl) << 2 = -12 * vl
7444 __ St2w(z14.VnS(), z15.VnS(), p5, SVEMemOperand(x0, x3, LSL, 2));
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007445
7446 // Wrap around from z31 to z0.
7447 __ Index(z31.VnD(), 32, -11);
7448 __ Index(z0.VnD(), 33, -11);
7449 __ Ptrue(p4.VnD(), SVE_MUL3);
Jacob Bramleye483ce52019-11-05 16:52:29 +00007450 __ Rdvl(x4, 1); // (1 * vl) << 3 = 8 * vl
7451 __ St2d(z31.VnD(), z0.VnD(), p4, SVEMemOperand(x0, x4, LSL, 3));
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007452
Jacob Bramleye483ce52019-11-05 16:52:29 +00007453 // We can test ld2 by comparing the values loaded with the values stored.
7454 // There are two complications:
7455 // - Loads have zeroing predication, so we have to clear the inactive
7456 // elements on our reference.
7457 // - We want to test both loads and stores that span { z31, z0 }, so we have
7458 // to move some values around.
7459 //
7460 // Registers z4-z11 will hold as-stored values (with inactive elements
7461 // cleared). Registers z20-z27 will hold the values that were loaded.
7462
7463 // Ld2b(z20.VnB(), z21.VnB(), ...)
7464 __ Dup(z4.VnB(), 0);
7465 __ Dup(z5.VnB(), 0);
7466 __ Mov(z4.VnB(), p7.Merging(), z10.VnB());
7467 __ Mov(z5.VnB(), p7.Merging(), z11.VnB());
7468
7469 // Ld2h(z22.VnH(), z23.VnH(), ...)
7470 __ Dup(z6.VnH(), 0);
7471 __ Dup(z7.VnH(), 0);
7472 __ Mov(z6.VnH(), p6.Merging(), z12.VnH());
7473 __ Mov(z7.VnH(), p6.Merging(), z13.VnH());
7474
7475 // Ld2w(z24.VnS(), z25.VnS(), ...)
7476 __ Dup(z8.VnS(), 0);
7477 __ Dup(z9.VnS(), 0);
7478 __ Mov(z8.VnS(), p5.Merging(), z14.VnS());
7479 __ Mov(z9.VnS(), p5.Merging(), z15.VnS());
7480
7481 // Ld2d(z31.VnD(), z0.VnD(), ...)
7482 __ Dup(z10.VnD(), 0);
7483 __ Dup(z11.VnD(), 0);
7484 __ Mov(z10.VnD(), p4.Merging(), z31.VnD());
7485 __ Mov(z11.VnD(), p4.Merging(), z0.VnD());
7486
7487 // Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
7488 __ Ld2b(z31.VnB(), z0.VnB(), p7.Zeroing(), SVEMemOperand(x0, x1));
7489 __ Mov(z20, z31);
7490 __ Mov(z21, z0);
7491
7492 __ Ld2h(z22.VnH(), z23.VnH(), p6.Zeroing(), SVEMemOperand(x0, x2, LSL, 1));
7493 __ Ld2w(z24.VnS(), z25.VnS(), p5.Zeroing(), SVEMemOperand(x0, x3, LSL, 2));
7494 __ Ld2d(z26.VnD(), z27.VnD(), p4.Zeroing(), SVEMemOperand(x0, x4, LSL, 3));
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007495
7496 END();
7497
7498 if (CAN_RUN()) {
7499 RUN();
7500
7501 uint8_t* expected = new uint8_t[data_size];
7502 memset(expected, 0, data_size);
7503 uint8_t* middle = &expected[data_size / 2];
7504
7505 int vl_b = vl / kBRegSizeInBytes;
7506 int vl_h = vl / kHRegSizeInBytes;
7507 int vl_s = vl / kSRegSizeInBytes;
7508 int vl_d = vl / kDRegSizeInBytes;
7509
7510 int reg_count = 2;
7511
Jacob Bramleye483ce52019-11-05 16:52:29 +00007512 // st2b { z10.b, z11.b }, SVE_MUL4
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007513 int vl_b_mul4 = vl_b - (vl_b % 4);
7514 for (int i = 0; i < vl_b_mul4; i++) {
7515 uint8_t lane0 = -4 + (11 * i);
7516 uint8_t lane1 = -5 + (11 * i);
7517 MemoryWrite(middle, 0, (i * reg_count) + 0, lane0);
7518 MemoryWrite(middle, 0, (i * reg_count) + 1, lane1);
7519 }
7520
Jacob Bramleye483ce52019-11-05 16:52:29 +00007521 // st2h { z12.h, z13.h }, SVE_VL16
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007522 if (vl_h >= 16) {
7523 for (int i = 0; i < 16; i++) {
7524 int64_t offset = (3 << kHRegSizeInBytesLog2) * vl;
7525 uint16_t lane0 = 6 - (2 * i);
7526 uint16_t lane1 = 7 - (2 * i);
7527 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7528 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7529 }
7530 }
7531
Jacob Bramleye483ce52019-11-05 16:52:29 +00007532 // st2w { z14.s, z15.s }, ((i % 5) == 0)
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007533 for (int i = 0; i < vl_s; i++) {
7534 if ((i % 5) == 0) {
7535 int64_t offset = -(3 << kSRegSizeInBytesLog2) * vl;
7536 uint32_t lane0 = -7 + (3 * i);
7537 uint32_t lane1 = -8 + (3 * i);
7538 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7539 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7540 }
7541 }
7542
7543 // st2d { z31.d, z0.d }, SVE_MUL3
7544 int vl_d_mul3 = vl_d - (vl_d % 3);
7545 for (int i = 0; i < vl_d_mul3; i++) {
7546 int64_t offset = (1 << kDRegSizeInBytesLog2) * vl;
7547 uint64_t lane0 = 32 - (11 * i);
7548 uint64_t lane1 = 33 - (11 * i);
7549 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7550 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7551 }
7552
7553 ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
7554
Jacob Bramleye483ce52019-11-05 16:52:29 +00007555 // Check that we loaded back the expected values.
7556
7557 // st2b/ld2b
7558 ASSERT_EQUAL_SVE(z4, z20);
7559 ASSERT_EQUAL_SVE(z5, z21);
7560
7561 // st2h/ld2h
7562 ASSERT_EQUAL_SVE(z6, z22);
7563 ASSERT_EQUAL_SVE(z7, z23);
7564
7565 // st2w/ld2w
7566 ASSERT_EQUAL_SVE(z8, z24);
7567 ASSERT_EQUAL_SVE(z9, z25);
7568
7569 // st2d/ld2d
7570 ASSERT_EQUAL_SVE(z10, z26);
7571 ASSERT_EQUAL_SVE(z11, z27);
7572
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007573 delete[] expected;
7574 }
7575 delete[] data;
7576}
7577
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007578TEST_SVE(sve_ld3_st3_scalar_plus_imm) {
7579 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
7580 START();
7581
7582 int vl = config->sve_vl_in_bytes();
7583
7584 // The immediate can address [-24, 21] times the VL, so allocate enough space
7585 // to exceed that in both directions.
7586 int data_size = vl * 128;
7587
7588 uint8_t* data = new uint8_t[data_size];
7589 memset(data, 0, data_size);
7590
Josh Sorefb43d6ef2022-08-03 12:47:14 -04007591 // Set the base half-way through the buffer so we can use negative indices.
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007592 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
7593
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007594 // We can test ld3 by comparing the values loaded with the values stored.
7595 // There are two complications:
7596 // - Loads have zeroing predication, so we have to clear the inactive
7597 // elements on our reference.
7598 // - We want to test both loads and stores that span { z31, z0 }, so we have
7599 // to move some values around.
7600 //
7601 // Registers z4-z15 will hold as-stored values (with inactive elements
7602 // cleared). Registers z16-z27 will hold the values that were loaded.
7603
7604 __ Index(z10.VnB(), 1, -3);
7605 __ Index(z11.VnB(), 2, -3);
7606 __ Index(z12.VnB(), 3, -3);
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007607 __ Ptrue(p0.VnB());
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007608 __ St3b(z10.VnB(), z11.VnB(), z12.VnB(), p0, SVEMemOperand(x0));
7609 // Save the stored values for ld3 tests.
7610 __ Dup(z4.VnB(), 0);
7611 __ Dup(z5.VnB(), 0);
7612 __ Dup(z6.VnB(), 0);
7613 __ Mov(z4.VnB(), p0.Merging(), z10.VnB());
7614 __ Mov(z5.VnB(), p0.Merging(), z11.VnB());
7615 __ Mov(z6.VnB(), p0.Merging(), z12.VnB());
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007616
7617 // Wrap around from z31 to z0.
7618 __ Index(z31.VnH(), -2, 5);
7619 __ Index(z0.VnH(), -3, 5);
7620 __ Index(z1.VnH(), -4, 5);
7621 __ Ptrue(p1.VnH(), SVE_MUL3);
7622 __ St3h(z31.VnH(), z0.VnH(), z1.VnH(), p1, SVEMemOperand(x0, 9, SVE_MUL_VL));
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007623 // Save the stored values for ld3 tests.
7624 __ Dup(z7.VnH(), 0);
7625 __ Dup(z8.VnH(), 0);
7626 __ Dup(z9.VnH(), 0);
7627 __ Mov(z7.VnH(), p1.Merging(), z31.VnH());
7628 __ Mov(z8.VnH(), p1.Merging(), z0.VnH());
7629 __ Mov(z9.VnH(), p1.Merging(), z1.VnH());
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007630
7631 __ Index(z30.VnS(), 3, -7);
7632 __ Index(z31.VnS(), 4, -7);
7633 __ Index(z0.VnS(), 5, -7);
7634 __ Ptrue(p2.VnS(), SVE_POW2);
7635 __ St3w(z30.VnS(),
7636 z31.VnS(),
7637 z0.VnS(),
7638 p2,
7639 SVEMemOperand(x0, -12, SVE_MUL_VL));
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007640 // Save the stored values for ld3 tests.
7641 __ Dup(z10.VnS(), 0);
7642 __ Dup(z11.VnS(), 0);
7643 __ Dup(z12.VnS(), 0);
7644 __ Mov(z10.VnS(), p2.Merging(), z30.VnS());
7645 __ Mov(z11.VnS(), p2.Merging(), z31.VnS());
7646 __ Mov(z12.VnS(), p2.Merging(), z0.VnS());
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007647
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007648 __ Index(z0.VnD(), -7, 3);
7649 __ Index(z1.VnD(), -8, 3);
7650 __ Index(z2.VnD(), -9, 3);
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007651 // Sparse predication, including some irrelevant bits (0xee). To make the
7652 // results easy to check, activate each lane <n> where n is a multiple of 5.
7653 Initialise(&masm,
7654 p3,
7655 0xeee10000000001ee,
7656 0xeeeeeee100000000,
7657 0x01eeeeeeeee10000,
7658 0x000001eeeeeeeee1);
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007659 __ St3d(z0.VnD(), z1.VnD(), z2.VnD(), p3, SVEMemOperand(x0, 15, SVE_MUL_VL));
7660 // Save the stored values for ld3 tests.
7661 __ Dup(z13.VnD(), 0);
7662 __ Dup(z14.VnD(), 0);
7663 __ Dup(z15.VnD(), 0);
7664 __ Mov(z13.VnD(), p3.Merging(), z0.VnD());
7665 __ Mov(z14.VnD(), p3.Merging(), z1.VnD());
7666 __ Mov(z15.VnD(), p3.Merging(), z2.VnD());
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007667
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007668 // Corresponding loads.
7669 // Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
7670 __ Ld3b(z31.VnB(), z0.VnB(), z1.VnB(), p0.Zeroing(), SVEMemOperand(x0));
7671 __ Mov(z16, z31);
7672 __ Mov(z17, z0);
7673 __ Mov(z18, z1);
7674 __ Ld3h(z30.VnH(),
7675 z31.VnH(),
7676 z0.VnH(),
7677 p1.Zeroing(),
7678 SVEMemOperand(x0, 9, SVE_MUL_VL));
7679 __ Mov(z19, z30);
7680 __ Mov(z20, z31);
7681 __ Mov(z21, z0);
7682 __ Ld3w(z22.VnS(),
7683 z23.VnS(),
7684 z24.VnS(),
7685 p2.Zeroing(),
7686 SVEMemOperand(x0, -12, SVE_MUL_VL));
7687 __ Ld3d(z25.VnD(),
7688 z26.VnD(),
7689 z27.VnD(),
7690 p3.Zeroing(),
7691 SVEMemOperand(x0, 15, SVE_MUL_VL));
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007692
7693 END();
7694
7695 if (CAN_RUN()) {
7696 RUN();
7697
7698 uint8_t* expected = new uint8_t[data_size];
7699 memset(expected, 0, data_size);
7700 uint8_t* middle = &expected[data_size / 2];
7701
7702 int vl_b = vl / kBRegSizeInBytes;
7703 int vl_h = vl / kHRegSizeInBytes;
7704 int vl_s = vl / kSRegSizeInBytes;
7705 int vl_d = vl / kDRegSizeInBytes;
7706
7707 int reg_count = 3;
7708
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007709 // st3b { z10.b, z11.b, z12.b }, SVE_ALL
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007710 for (int i = 0; i < vl_b; i++) {
7711 uint8_t lane0 = 1 - (3 * i);
7712 uint8_t lane1 = 2 - (3 * i);
7713 uint8_t lane2 = 3 - (3 * i);
7714 MemoryWrite(middle, 0, (i * reg_count) + 0, lane0);
7715 MemoryWrite(middle, 0, (i * reg_count) + 1, lane1);
7716 MemoryWrite(middle, 0, (i * reg_count) + 2, lane2);
7717 }
7718
7719 // st3h { z31.h, z0.h, z1.h }, SVE_MUL3
7720 int vl_h_mul3 = vl_h - (vl_h % 3);
7721 for (int i = 0; i < vl_h_mul3; i++) {
7722 int64_t offset = 9 * vl;
7723 uint16_t lane0 = -2 + (5 * i);
7724 uint16_t lane1 = -3 + (5 * i);
7725 uint16_t lane2 = -4 + (5 * i);
7726 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7727 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7728 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
7729 }
7730
7731 // st3w { z30.s, z31.s, z0.s }, SVE_POW2
7732 int vl_s_pow2 = 1 << HighestSetBitPosition(vl_s);
7733 for (int i = 0; i < vl_s_pow2; i++) {
7734 int64_t offset = -12 * vl;
7735 uint32_t lane0 = 3 - (7 * i);
7736 uint32_t lane1 = 4 - (7 * i);
7737 uint32_t lane2 = 5 - (7 * i);
7738 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7739 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7740 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
7741 }
7742
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007743 // st3d { z0.d, z1.d, z2.d }, ((i % 5) == 0)
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007744 for (int i = 0; i < vl_d; i++) {
7745 if ((i % 5) == 0) {
7746 int64_t offset = 15 * vl;
7747 uint64_t lane0 = -7 + (3 * i);
7748 uint64_t lane1 = -8 + (3 * i);
7749 uint64_t lane2 = -9 + (3 * i);
7750 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7751 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7752 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
7753 }
7754 }
7755
7756 ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
7757
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00007758 // Check that we loaded back the expected values.
7759
7760 // st3b/ld3b
7761 ASSERT_EQUAL_SVE(z4, z16);
7762 ASSERT_EQUAL_SVE(z5, z17);
7763 ASSERT_EQUAL_SVE(z6, z18);
7764
7765 // st3h/ld3h
7766 ASSERT_EQUAL_SVE(z7, z19);
7767 ASSERT_EQUAL_SVE(z8, z20);
7768 ASSERT_EQUAL_SVE(z9, z21);
7769
7770 // st3w/ld3w
7771 ASSERT_EQUAL_SVE(z10, z22);
7772 ASSERT_EQUAL_SVE(z11, z23);
7773 ASSERT_EQUAL_SVE(z12, z24);
7774
7775 // st3d/ld3d
7776 ASSERT_EQUAL_SVE(z13, z25);
7777 ASSERT_EQUAL_SVE(z14, z26);
7778 ASSERT_EQUAL_SVE(z15, z27);
7779
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007780 delete[] expected;
7781 }
7782 delete[] data;
7783}
7784
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007785TEST_SVE(sve_ld3_st3_scalar_plus_scalar) {
7786 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
7787 START();
7788
7789 int vl = config->sve_vl_in_bytes();
7790
7791 // Allocate plenty of space to enable indexing in both directions.
7792 int data_size = vl * 128;
7793
7794 uint8_t* data = new uint8_t[data_size];
7795 memset(data, 0, data_size);
7796
Josh Sorefb43d6ef2022-08-03 12:47:14 -04007797 // Set the base half-way through the buffer so we can use negative indices.
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007798 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
7799
Jacob Bramleye483ce52019-11-05 16:52:29 +00007800 // We can test ld3 by comparing the values loaded with the values stored.
7801 // There are two complications:
7802 // - Loads have zeroing predication, so we have to clear the inactive
7803 // elements on our reference.
7804 // - We want to test both loads and stores that span { z31, z0 }, so we have
7805 // to move some values around.
7806 //
7807 // Registers z4-z15 will hold as-stored values (with inactive elements
7808 // cleared). Registers z16-z27 will hold the values that were loaded.
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007809
Jacob Bramleye483ce52019-11-05 16:52:29 +00007810 __ Index(z10.VnB(), -4, 11);
7811 __ Index(z11.VnB(), -5, 11);
7812 __ Index(z12.VnB(), -6, 11);
7813 __ Ptrue(p7.VnB(), SVE_MUL4);
7814 __ Rdvl(x1, -1); // Make offsets VL-dependent so we can avoid overlap.
7815 __ St3b(z10.VnB(), z11.VnB(), z12.VnB(), p7, SVEMemOperand(x0, x1, LSL, 0));
7816 // Save the stored values for ld3 tests.
7817 __ Dup(z4.VnB(), 0);
7818 __ Dup(z5.VnB(), 0);
7819 __ Dup(z6.VnB(), 0);
7820 __ Mov(z4.VnB(), p7.Merging(), z10.VnB());
7821 __ Mov(z5.VnB(), p7.Merging(), z11.VnB());
7822 __ Mov(z6.VnB(), p7.Merging(), z12.VnB());
7823
7824 __ Index(z13.VnH(), 6, -2);
7825 __ Index(z14.VnH(), 7, -2);
7826 __ Index(z15.VnH(), 8, -2);
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007827 __ Ptrue(p6.VnH(), SVE_VL16);
Jacob Bramleye483ce52019-11-05 16:52:29 +00007828 __ Rdvl(x2, 5); // (5 * vl) << 1 = 10 * vl
7829 __ St3h(z13.VnH(), z14.VnH(), z15.VnH(), p6, SVEMemOperand(x0, x2, LSL, 1));
7830 // Save the stored values for ld3 tests.
7831 __ Dup(z7.VnH(), 0);
7832 __ Dup(z8.VnH(), 0);
7833 __ Dup(z9.VnH(), 0);
7834 __ Mov(z7.VnH(), p6.Merging(), z13.VnH());
7835 __ Mov(z8.VnH(), p6.Merging(), z14.VnH());
7836 __ Mov(z9.VnH(), p6.Merging(), z15.VnH());
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007837
7838 // Wrap around from z31 to z0.
7839 __ Index(z30.VnS(), -7, 3);
7840 __ Index(z31.VnS(), -8, 3);
7841 __ Index(z0.VnS(), -9, 3);
7842 // Sparse predication, including some irrelevant bits (0xe). To make the
7843 // results easy to check, activate each lane <n> where n is a multiple of 5.
7844 Initialise(&masm,
7845 p5,
7846 0xeee1000010000100,
7847 0x001eeee100001000,
7848 0x0100001eeee10000,
7849 0x10000100001eeee1);
Jacob Bramleye483ce52019-11-05 16:52:29 +00007850 __ Rdvl(x3, -5); // -(5 * vl) << 2 = -20 * vl
7851 __ St3w(z30.VnS(), z31.VnS(), z0.VnS(), p5, SVEMemOperand(x0, x3, LSL, 2));
7852 // Save the stored values for ld3 tests.
7853 __ Dup(z10.VnS(), 0);
7854 __ Dup(z11.VnS(), 0);
7855 __ Dup(z12.VnS(), 0);
7856 __ Mov(z10.VnS(), p5.Merging(), z30.VnS());
7857 __ Mov(z11.VnS(), p5.Merging(), z31.VnS());
7858 __ Mov(z12.VnS(), p5.Merging(), z0.VnS());
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007859
7860 __ Index(z31.VnD(), 32, -11);
7861 __ Index(z0.VnD(), 33, -11);
7862 __ Index(z1.VnD(), 34, -11);
7863 __ Ptrue(p4.VnD(), SVE_MUL3);
Jacob Bramleye483ce52019-11-05 16:52:29 +00007864 __ Rdvl(x4, -1); // -(1 * vl) << 3 = -8 * vl
7865 __ St3d(z31.VnD(), z0.VnD(), z1.VnD(), p4, SVEMemOperand(x0, x4, LSL, 3));
7866 // Save the stored values for ld3 tests.
7867 __ Dup(z13.VnD(), 0);
7868 __ Dup(z14.VnD(), 0);
7869 __ Dup(z15.VnD(), 0);
7870 __ Mov(z13.VnD(), p4.Merging(), z31.VnD());
7871 __ Mov(z14.VnD(), p4.Merging(), z0.VnD());
7872 __ Mov(z15.VnD(), p4.Merging(), z1.VnD());
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007873
Jacob Bramleye483ce52019-11-05 16:52:29 +00007874 // Corresponding loads.
7875 // Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
7876 __ Ld3b(z31.VnB(),
7877 z0.VnB(),
7878 z1.VnB(),
7879 p7.Zeroing(),
7880 SVEMemOperand(x0, x1, LSL, 0));
7881 __ Mov(z16, z31);
7882 __ Mov(z17, z0);
7883 __ Mov(z18, z1);
7884 __ Ld3h(z30.VnH(),
7885 z31.VnH(),
7886 z0.VnH(),
7887 p6.Zeroing(),
7888 SVEMemOperand(x0, x2, LSL, 1));
7889 __ Mov(z19, z30);
7890 __ Mov(z20, z31);
7891 __ Mov(z21, z0);
7892 __ Ld3w(z22.VnS(),
7893 z23.VnS(),
7894 z24.VnS(),
7895 p5.Zeroing(),
7896 SVEMemOperand(x0, x3, LSL, 2));
7897 __ Ld3d(z25.VnD(),
7898 z26.VnD(),
7899 z27.VnD(),
7900 p4.Zeroing(),
7901 SVEMemOperand(x0, x4, LSL, 3));
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007902
7903 END();
7904
7905 if (CAN_RUN()) {
7906 RUN();
7907
7908 uint8_t* expected = new uint8_t[data_size];
7909 memset(expected, 0, data_size);
7910 uint8_t* middle = &expected[data_size / 2];
7911
7912 int vl_b = vl / kBRegSizeInBytes;
7913 int vl_h = vl / kHRegSizeInBytes;
7914 int vl_s = vl / kSRegSizeInBytes;
7915 int vl_d = vl / kDRegSizeInBytes;
7916
7917 int reg_count = 3;
7918
Jacob Bramleye483ce52019-11-05 16:52:29 +00007919 // st3b { z10.b, z11.b, z12.b }, SVE_MUL4
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007920 int vl_b_mul4 = vl_b - (vl_b % 4);
7921 for (int i = 0; i < vl_b_mul4; i++) {
Jacob Bramleye483ce52019-11-05 16:52:29 +00007922 int64_t offset = -(1 << kBRegSizeInBytesLog2) * vl;
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007923 uint8_t lane0 = -4 + (11 * i);
7924 uint8_t lane1 = -5 + (11 * i);
7925 uint8_t lane2 = -6 + (11 * i);
7926 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7927 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7928 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
7929 }
7930
Jacob Bramleye483ce52019-11-05 16:52:29 +00007931 // st3h { z13.h, z14.h, z15.h }, SVE_VL16
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007932 if (vl_h >= 16) {
7933 for (int i = 0; i < 16; i++) {
7934 int64_t offset = (5 << kHRegSizeInBytesLog2) * vl;
7935 uint16_t lane0 = 6 - (2 * i);
7936 uint16_t lane1 = 7 - (2 * i);
7937 uint16_t lane2 = 8 - (2 * i);
7938 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7939 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7940 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
7941 }
7942 }
7943
7944 // st3w { z30.s, z31.s, z0.s }, ((i % 5) == 0)
7945 for (int i = 0; i < vl_s; i++) {
7946 if ((i % 5) == 0) {
7947 int64_t offset = -(5 << kSRegSizeInBytesLog2) * vl;
7948 uint32_t lane0 = -7 + (3 * i);
7949 uint32_t lane1 = -8 + (3 * i);
7950 uint32_t lane2 = -9 + (3 * i);
7951 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7952 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7953 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
7954 }
7955 }
7956
7957 // st3d { z31.d, z0.d, z1.d }, SVE_MUL3
7958 int vl_d_mul3 = vl_d - (vl_d % 3);
7959 for (int i = 0; i < vl_d_mul3; i++) {
Jacob Bramleye483ce52019-11-05 16:52:29 +00007960 int64_t offset = -(1 << kDRegSizeInBytesLog2) * vl;
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007961 uint64_t lane0 = 32 - (11 * i);
7962 uint64_t lane1 = 33 - (11 * i);
7963 uint64_t lane2 = 34 - (11 * i);
7964 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
7965 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
7966 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
7967 }
7968
7969 ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
7970
Jacob Bramleye483ce52019-11-05 16:52:29 +00007971 // Check that we loaded back the expected values.
7972
7973 // st3b/ld3b
7974 ASSERT_EQUAL_SVE(z4, z16);
7975 ASSERT_EQUAL_SVE(z5, z17);
7976 ASSERT_EQUAL_SVE(z6, z18);
7977
7978 // st3h/ld3h
7979 ASSERT_EQUAL_SVE(z7, z19);
7980 ASSERT_EQUAL_SVE(z8, z20);
7981 ASSERT_EQUAL_SVE(z9, z21);
7982
7983 // st3w/ld3w
7984 ASSERT_EQUAL_SVE(z10, z22);
7985 ASSERT_EQUAL_SVE(z11, z23);
7986 ASSERT_EQUAL_SVE(z12, z24);
7987
7988 // st3d/ld3d
7989 ASSERT_EQUAL_SVE(z13, z25);
7990 ASSERT_EQUAL_SVE(z14, z26);
7991 ASSERT_EQUAL_SVE(z15, z27);
7992
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00007993 delete[] expected;
7994 }
7995 delete[] data;
7996}
7997
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00007998TEST_SVE(sve_ld4_st4_scalar_plus_imm) {
7999 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
8000 START();
8001
8002 int vl = config->sve_vl_in_bytes();
8003
8004 // The immediate can address [-32, 28] times the VL, so allocate enough space
8005 // to exceed that in both directions.
8006 int data_size = vl * 128;
8007
8008 uint8_t* data = new uint8_t[data_size];
8009 memset(data, 0, data_size);
8010
Josh Sorefb43d6ef2022-08-03 12:47:14 -04008011 // Set the base half-way through the buffer so we can use negative indices.
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008012 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
8013
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00008014 // We can test ld4 by comparing the values loaded with the values stored.
8015 // There are two complications:
8016 // - Loads have zeroing predication, so we have to clear the inactive
8017 // elements on our reference.
8018 // - We want to test both loads and stores that span { z31, z0 }, so we have
8019 // to move some values around.
8020 //
8021 // Registers z3-z18 will hold as-stored values (with inactive elements
8022 // cleared). Registers z19-z31 and z0-z2 will hold the values that were
8023 // loaded.
8024
8025 __ Index(z10.VnB(), 1, -7);
8026 __ Index(z11.VnB(), 2, -7);
8027 __ Index(z12.VnB(), 3, -7);
8028 __ Index(z13.VnB(), 4, -7);
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008029 __ Ptrue(p0.VnB());
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00008030 __ St4b(z10.VnB(), z11.VnB(), z12.VnB(), z13.VnB(), p0, SVEMemOperand(x0));
8031 // Save the stored values for ld4 tests.
8032 __ Dup(z3.VnB(), 0);
8033 __ Dup(z4.VnB(), 0);
8034 __ Dup(z5.VnB(), 0);
8035 __ Dup(z6.VnB(), 0);
8036 __ Mov(z3.VnB(), p0.Merging(), z10.VnB());
8037 __ Mov(z4.VnB(), p0.Merging(), z11.VnB());
8038 __ Mov(z5.VnB(), p0.Merging(), z12.VnB());
8039 __ Mov(z6.VnB(), p0.Merging(), z13.VnB());
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008040
8041 // Wrap around from z31 to z0.
8042 __ Index(z31.VnH(), -2, 5);
8043 __ Index(z0.VnH(), -3, 5);
8044 __ Index(z1.VnH(), -4, 5);
8045 __ Index(z2.VnH(), -5, 5);
8046 __ Ptrue(p1.VnH(), SVE_MUL3);
8047 __ St4h(z31.VnH(),
8048 z0.VnH(),
8049 z1.VnH(),
8050 z2.VnH(),
8051 p1,
8052 SVEMemOperand(x0, 4, SVE_MUL_VL));
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00008053 // Save the stored values for ld4 tests.
8054 __ Dup(z7.VnH(), 0);
8055 __ Dup(z8.VnH(), 0);
8056 __ Dup(z9.VnH(), 0);
8057 __ Dup(z10.VnH(), 0);
8058 __ Mov(z7.VnH(), p1.Merging(), z31.VnH());
8059 __ Mov(z8.VnH(), p1.Merging(), z0.VnH());
8060 __ Mov(z9.VnH(), p1.Merging(), z1.VnH());
8061 __ Mov(z10.VnH(), p1.Merging(), z2.VnH());
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008062
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00008063 // Wrap around from z31 to z0.
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008064 __ Index(z29.VnS(), 2, -7);
8065 __ Index(z30.VnS(), 3, -7);
8066 __ Index(z31.VnS(), 4, -7);
8067 __ Index(z0.VnS(), 5, -7);
8068 __ Ptrue(p2.VnS(), SVE_POW2);
8069 __ St4w(z29.VnS(),
8070 z30.VnS(),
8071 z31.VnS(),
8072 z0.VnS(),
8073 p2,
8074 SVEMemOperand(x0, -12, SVE_MUL_VL));
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00008075 // Save the stored values for ld4 tests.
8076 __ Dup(z11.VnS(), 0);
8077 __ Dup(z12.VnS(), 0);
8078 __ Dup(z13.VnS(), 0);
8079 __ Dup(z14.VnS(), 0);
8080 __ Mov(z11.VnS(), p2.Merging(), z29.VnS());
8081 __ Mov(z12.VnS(), p2.Merging(), z30.VnS());
8082 __ Mov(z13.VnS(), p2.Merging(), z31.VnS());
8083 __ Mov(z14.VnS(), p2.Merging(), z0.VnS());
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008084
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00008085 __ Index(z20.VnD(), -7, 8);
8086 __ Index(z21.VnD(), -8, 8);
8087 __ Index(z22.VnD(), -9, 8);
8088 __ Index(z23.VnD(), -10, 8);
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008089 // Sparse predication, including some irrelevant bits (0xee). To make the
8090 // results easy to check, activate each lane <n> where n is a multiple of 5.
8091 Initialise(&masm,
8092 p3,
8093 0xeee10000000001ee,
8094 0xeeeeeee100000000,
8095 0x01eeeeeeeee10000,
8096 0x000001eeeeeeeee1);
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00008097 __ St4d(z20.VnD(),
8098 z21.VnD(),
8099 z22.VnD(),
8100 z23.VnD(),
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008101 p3,
8102 SVEMemOperand(x0, 16, SVE_MUL_VL));
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00008103 // Save the stored values for ld4 tests.
8104 __ Dup(z15.VnD(), 0);
8105 __ Dup(z16.VnD(), 0);
8106 __ Dup(z17.VnD(), 0);
8107 __ Dup(z18.VnD(), 0);
8108 __ Mov(z15.VnD(), p3.Merging(), z20.VnD());
8109 __ Mov(z16.VnD(), p3.Merging(), z21.VnD());
8110 __ Mov(z17.VnD(), p3.Merging(), z22.VnD());
8111 __ Mov(z18.VnD(), p3.Merging(), z23.VnD());
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008112
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00008113 // Corresponding loads.
8114 // Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
8115 __ Ld4b(z31.VnB(),
8116 z0.VnB(),
8117 z1.VnB(),
8118 z2.VnB(),
8119 p0.Zeroing(),
8120 SVEMemOperand(x0));
8121 __ Mov(z19, z31);
8122 __ Mov(z20, z0);
8123 __ Mov(z21, z1);
8124 __ Mov(z22, z2);
8125 __ Ld4h(z23.VnH(),
8126 z24.VnH(),
8127 z25.VnH(),
8128 z26.VnH(),
8129 p1.Zeroing(),
8130 SVEMemOperand(x0, 4, SVE_MUL_VL));
8131 __ Ld4w(z27.VnS(),
8132 z28.VnS(),
8133 z29.VnS(),
8134 z30.VnS(),
8135 p2.Zeroing(),
8136 SVEMemOperand(x0, -12, SVE_MUL_VL));
8137 // Wrap around from z31 to z0.
8138 __ Ld4d(z31.VnD(),
8139 z0.VnD(),
8140 z1.VnD(),
8141 z2.VnD(),
8142 p3.Zeroing(),
8143 SVEMemOperand(x0, 16, SVE_MUL_VL));
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008144
8145 END();
8146
8147 if (CAN_RUN()) {
8148 RUN();
8149
8150 uint8_t* expected = new uint8_t[data_size];
8151 memset(expected, 0, data_size);
8152 uint8_t* middle = &expected[data_size / 2];
8153
8154 int vl_b = vl / kBRegSizeInBytes;
8155 int vl_h = vl / kHRegSizeInBytes;
8156 int vl_s = vl / kSRegSizeInBytes;
8157 int vl_d = vl / kDRegSizeInBytes;
8158
8159 int reg_count = 4;
8160
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00008161 // st4b { z10.b, z11.b, z12.b, z13.b }, SVE_ALL
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008162 for (int i = 0; i < vl_b; i++) {
8163 uint8_t lane0 = 1 - (7 * i);
8164 uint8_t lane1 = 2 - (7 * i);
8165 uint8_t lane2 = 3 - (7 * i);
8166 uint8_t lane3 = 4 - (7 * i);
8167 MemoryWrite(middle, 0, (i * reg_count) + 0, lane0);
8168 MemoryWrite(middle, 0, (i * reg_count) + 1, lane1);
8169 MemoryWrite(middle, 0, (i * reg_count) + 2, lane2);
8170 MemoryWrite(middle, 0, (i * reg_count) + 3, lane3);
8171 }
8172
8173 // st4h { z31.h, z0.h, z1.h, z2.h }, SVE_MUL3
8174 int vl_h_mul3 = vl_h - (vl_h % 3);
8175 for (int i = 0; i < vl_h_mul3; i++) {
8176 int64_t offset = 4 * vl;
8177 uint16_t lane0 = -2 + (5 * i);
8178 uint16_t lane1 = -3 + (5 * i);
8179 uint16_t lane2 = -4 + (5 * i);
8180 uint16_t lane3 = -5 + (5 * i);
8181 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
8182 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
8183 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
8184 MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
8185 }
8186
8187 // st4w { z29.s, z30.s, z31.s, z0.s }, SVE_POW2
8188 int vl_s_pow2 = 1 << HighestSetBitPosition(vl_s);
8189 for (int i = 0; i < vl_s_pow2; i++) {
8190 int64_t offset = -12 * vl;
8191 uint32_t lane0 = 2 - (7 * i);
8192 uint32_t lane1 = 3 - (7 * i);
8193 uint32_t lane2 = 4 - (7 * i);
8194 uint32_t lane3 = 5 - (7 * i);
8195 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
8196 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
8197 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
8198 MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
8199 }
8200
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00008201 // st4d { z20.d, z21.d, z22.d, z23.d }, ((i % 5) == 0)
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008202 for (int i = 0; i < vl_d; i++) {
8203 if ((i % 5) == 0) {
8204 int64_t offset = 16 * vl;
8205 uint64_t lane0 = -7 + (8 * i);
8206 uint64_t lane1 = -8 + (8 * i);
8207 uint64_t lane2 = -9 + (8 * i);
8208 uint64_t lane3 = -10 + (8 * i);
8209 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
8210 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
8211 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
8212 MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
8213 }
8214 }
8215
8216 ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
8217
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00008218 // Check that we loaded back the expected values.
8219
8220 // st4b/ld4b
8221 ASSERT_EQUAL_SVE(z3, z19);
8222 ASSERT_EQUAL_SVE(z4, z20);
8223 ASSERT_EQUAL_SVE(z5, z21);
8224 ASSERT_EQUAL_SVE(z6, z22);
8225
8226 // st4h/ld4h
8227 ASSERT_EQUAL_SVE(z7, z23);
8228 ASSERT_EQUAL_SVE(z8, z24);
8229 ASSERT_EQUAL_SVE(z9, z25);
8230 ASSERT_EQUAL_SVE(z10, z26);
8231
8232 // st4w/ld4w
8233 ASSERT_EQUAL_SVE(z11, z27);
8234 ASSERT_EQUAL_SVE(z12, z28);
8235 ASSERT_EQUAL_SVE(z13, z29);
8236 ASSERT_EQUAL_SVE(z14, z30);
8237
8238 // st4d/ld4d
8239 ASSERT_EQUAL_SVE(z15, z31);
8240 ASSERT_EQUAL_SVE(z16, z0);
8241 ASSERT_EQUAL_SVE(z17, z1);
8242 ASSERT_EQUAL_SVE(z18, z2);
8243
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008244 delete[] expected;
8245 }
8246 delete[] data;
8247}
8248
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008249TEST_SVE(sve_ld4_st4_scalar_plus_scalar) {
8250 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
8251 START();
8252
8253 int vl = config->sve_vl_in_bytes();
8254
8255 // Allocate plenty of space to enable indexing in both directions.
8256 int data_size = vl * 128;
8257
8258 uint8_t* data = new uint8_t[data_size];
8259 memset(data, 0, data_size);
8260
Josh Sorefb43d6ef2022-08-03 12:47:14 -04008261 // Set the base half-way through the buffer so we can use negative indices.
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008262 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
8263
Jacob Bramleye483ce52019-11-05 16:52:29 +00008264 // We can test ld4 by comparing the values loaded with the values stored.
8265 // There are two complications:
8266 // - Loads have zeroing predication, so we have to clear the inactive
8267 // elements on our reference.
8268 // - We want to test both loads and stores that span { z31, z0 }, so we have
8269 // to move some values around.
8270 //
8271 // Registers z3-z18 will hold as-stored values (with inactive elements
8272 // cleared). Registers z19-z31 and z0-z2 will hold the values that were
8273 // loaded.
8274
8275 __ Index(z19.VnB(), -4, 11);
8276 __ Index(z20.VnB(), -5, 11);
8277 __ Index(z21.VnB(), -6, 11);
8278 __ Index(z22.VnB(), -7, 11);
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008279 __ Ptrue(p7.VnB(), SVE_MUL4);
Jacob Bramleye483ce52019-11-05 16:52:29 +00008280 __ Rdvl(x1, -1); // Make offsets VL-dependent so we can avoid overlap.
8281 __ St4b(z19.VnB(),
8282 z20.VnB(),
8283 z21.VnB(),
8284 z22.VnB(),
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008285 p7,
8286 SVEMemOperand(x0, x1, LSL, 0));
Jacob Bramleye483ce52019-11-05 16:52:29 +00008287 // Save the stored values for ld4 tests.
8288 __ Dup(z3.VnB(), 0);
8289 __ Dup(z4.VnB(), 0);
8290 __ Dup(z5.VnB(), 0);
8291 __ Dup(z6.VnB(), 0);
8292 __ Mov(z3.VnB(), p7.Merging(), z19.VnB());
8293 __ Mov(z4.VnB(), p7.Merging(), z20.VnB());
8294 __ Mov(z5.VnB(), p7.Merging(), z21.VnB());
8295 __ Mov(z6.VnB(), p7.Merging(), z22.VnB());
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008296
Jacob Bramleye483ce52019-11-05 16:52:29 +00008297 __ Index(z23.VnH(), 6, -2);
8298 __ Index(z24.VnH(), 7, -2);
8299 __ Index(z25.VnH(), 8, -2);
8300 __ Index(z26.VnH(), 9, -2);
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008301 __ Ptrue(p6.VnH(), SVE_VL16);
Jacob Bramleye483ce52019-11-05 16:52:29 +00008302 __ Rdvl(x2, 7); // (7 * vl) << 1 = 14 * vl
8303 __ St4h(z23.VnH(),
8304 z24.VnH(),
8305 z25.VnH(),
8306 z26.VnH(),
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008307 p6,
8308 SVEMemOperand(x0, x2, LSL, 1));
Jacob Bramleye483ce52019-11-05 16:52:29 +00008309 // Save the stored values for ld4 tests.
8310 __ Dup(z7.VnH(), 0);
8311 __ Dup(z8.VnH(), 0);
8312 __ Dup(z9.VnH(), 0);
8313 __ Dup(z10.VnH(), 0);
8314 __ Mov(z7.VnH(), p6.Merging(), z23.VnH());
8315 __ Mov(z8.VnH(), p6.Merging(), z24.VnH());
8316 __ Mov(z9.VnH(), p6.Merging(), z25.VnH());
8317 __ Mov(z10.VnH(), p6.Merging(), z26.VnH());
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008318
8319 // Wrap around from z31 to z0.
8320 __ Index(z29.VnS(), -6, 7);
8321 __ Index(z30.VnS(), -7, 7);
8322 __ Index(z31.VnS(), -8, 7);
8323 __ Index(z0.VnS(), -9, 7);
8324 // Sparse predication, including some irrelevant bits (0xe). To make the
8325 // results easy to check, activate each lane <n> where n is a multiple of 5.
8326 Initialise(&masm,
8327 p5,
8328 0xeee1000010000100,
8329 0x001eeee100001000,
8330 0x0100001eeee10000,
8331 0x10000100001eeee1);
Jacob Bramleye483ce52019-11-05 16:52:29 +00008332 __ Rdvl(x3, -5); // -(5 * vl) << 2 = -20 * vl
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008333 __ St4w(z29.VnS(),
8334 z30.VnS(),
8335 z31.VnS(),
8336 z0.VnS(),
8337 p5,
Jacob Bramleye483ce52019-11-05 16:52:29 +00008338 SVEMemOperand(x0, x3, LSL, 2));
8339 // Save the stored values for ld4 tests.
8340 __ Dup(z11.VnS(), 0);
8341 __ Dup(z12.VnS(), 0);
8342 __ Dup(z13.VnS(), 0);
8343 __ Dup(z14.VnS(), 0);
8344 __ Mov(z11.VnS(), p5.Merging(), z29.VnS());
8345 __ Mov(z12.VnS(), p5.Merging(), z30.VnS());
8346 __ Mov(z13.VnS(), p5.Merging(), z31.VnS());
8347 __ Mov(z14.VnS(), p5.Merging(), z0.VnS());
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008348
8349 __ Index(z31.VnD(), 32, -11);
8350 __ Index(z0.VnD(), 33, -11);
8351 __ Index(z1.VnD(), 34, -11);
8352 __ Index(z2.VnD(), 35, -11);
8353 __ Ptrue(p4.VnD(), SVE_MUL3);
Jacob Bramleye483ce52019-11-05 16:52:29 +00008354 __ Rdvl(x4, -1); // -(1 * vl) << 3 = -8 * vl
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008355 __ St4d(z31.VnD(),
8356 z0.VnD(),
8357 z1.VnD(),
8358 z2.VnD(),
8359 p4,
Jacob Bramleye483ce52019-11-05 16:52:29 +00008360 SVEMemOperand(x0, x4, LSL, 3));
8361 // Save the stored values for ld4 tests.
8362 __ Dup(z15.VnD(), 0);
8363 __ Dup(z16.VnD(), 0);
8364 __ Dup(z17.VnD(), 0);
8365 __ Dup(z18.VnD(), 0);
8366 __ Mov(z15.VnD(), p4.Merging(), z31.VnD());
8367 __ Mov(z16.VnD(), p4.Merging(), z0.VnD());
8368 __ Mov(z17.VnD(), p4.Merging(), z1.VnD());
8369 __ Mov(z18.VnD(), p4.Merging(), z2.VnD());
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008370
Jacob Bramleye483ce52019-11-05 16:52:29 +00008371 // Corresponding loads.
8372 // Wrap around from z31 to z0, moving the results elsewhere to avoid overlap.
8373 __ Ld4b(z31.VnB(),
8374 z0.VnB(),
8375 z1.VnB(),
8376 z2.VnB(),
8377 p7.Zeroing(),
8378 SVEMemOperand(x0, x1, LSL, 0));
8379 __ Mov(z19, z31);
8380 __ Mov(z20, z0);
8381 __ Mov(z21, z1);
8382 __ Mov(z22, z2);
8383 __ Ld4h(z23.VnH(),
8384 z24.VnH(),
8385 z25.VnH(),
8386 z26.VnH(),
8387 p6.Zeroing(),
8388 SVEMemOperand(x0, x2, LSL, 1));
8389 __ Ld4w(z27.VnS(),
8390 z28.VnS(),
8391 z29.VnS(),
8392 z30.VnS(),
8393 p5.Zeroing(),
8394 SVEMemOperand(x0, x3, LSL, 2));
8395 // Wrap around from z31 to z0.
8396 __ Ld4d(z31.VnD(),
8397 z0.VnD(),
8398 z1.VnD(),
8399 z2.VnD(),
8400 p4.Zeroing(),
8401 SVEMemOperand(x0, x4, LSL, 3));
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008402
8403 END();
8404
8405 if (CAN_RUN()) {
8406 RUN();
8407
8408 uint8_t* expected = new uint8_t[data_size];
8409 memset(expected, 0, data_size);
8410 uint8_t* middle = &expected[data_size / 2];
8411
8412 int vl_b = vl / kBRegSizeInBytes;
8413 int vl_h = vl / kHRegSizeInBytes;
8414 int vl_s = vl / kSRegSizeInBytes;
8415 int vl_d = vl / kDRegSizeInBytes;
8416
8417 int reg_count = 4;
8418
Jacob Bramleye483ce52019-11-05 16:52:29 +00008419 // st4b { z19.b, z20.b, z21.b, z22.b }, SVE_MUL4
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008420 int vl_b_mul4 = vl_b - (vl_b % 4);
8421 for (int i = 0; i < vl_b_mul4; i++) {
Jacob Bramleye483ce52019-11-05 16:52:29 +00008422 int64_t offset = -(1 << kBRegSizeInBytesLog2) * vl;
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008423 uint8_t lane0 = -4 + (11 * i);
8424 uint8_t lane1 = -5 + (11 * i);
8425 uint8_t lane2 = -6 + (11 * i);
8426 uint8_t lane3 = -7 + (11 * i);
8427 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
8428 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
8429 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
8430 MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
8431 }
8432
Jacob Bramleye483ce52019-11-05 16:52:29 +00008433 // st4h { z23.h, z24.h, z25.h, z26.h }, SVE_VL16
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008434 if (vl_h >= 16) {
8435 for (int i = 0; i < 16; i++) {
8436 int64_t offset = (7 << kHRegSizeInBytesLog2) * vl;
8437 uint16_t lane0 = 6 - (2 * i);
8438 uint16_t lane1 = 7 - (2 * i);
8439 uint16_t lane2 = 8 - (2 * i);
8440 uint16_t lane3 = 9 - (2 * i);
8441 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
8442 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
8443 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
8444 MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
8445 }
8446 }
8447
8448 // st4w { z29.s, z30.s, z31.s, z0.s }, ((i % 5) == 0)
8449 for (int i = 0; i < vl_s; i++) {
8450 if ((i % 5) == 0) {
8451 int64_t offset = -(5 << kSRegSizeInBytesLog2) * vl;
8452 uint32_t lane0 = -6 + (7 * i);
8453 uint32_t lane1 = -7 + (7 * i);
8454 uint32_t lane2 = -8 + (7 * i);
8455 uint32_t lane3 = -9 + (7 * i);
8456 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
8457 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
8458 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
8459 MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
8460 }
8461 }
8462
8463 // st4d { z31.d, z0.d, z1.d, z2.d }, SVE_MUL3
8464 int vl_d_mul3 = vl_d - (vl_d % 3);
8465 for (int i = 0; i < vl_d_mul3; i++) {
Jacob Bramleye483ce52019-11-05 16:52:29 +00008466 int64_t offset = -(1 << kDRegSizeInBytesLog2) * vl;
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008467 uint64_t lane0 = 32 - (11 * i);
8468 uint64_t lane1 = 33 - (11 * i);
8469 uint64_t lane2 = 34 - (11 * i);
8470 uint64_t lane3 = 35 - (11 * i);
8471 MemoryWrite(middle, offset, (i * reg_count) + 0, lane0);
8472 MemoryWrite(middle, offset, (i * reg_count) + 1, lane1);
8473 MemoryWrite(middle, offset, (i * reg_count) + 2, lane2);
8474 MemoryWrite(middle, offset, (i * reg_count) + 3, lane3);
8475 }
8476
8477 ASSERT_EQUAL_MEMORY(expected, data, data_size, middle - expected);
8478
Jacob Bramleye483ce52019-11-05 16:52:29 +00008479 // Check that we loaded back the expected values.
8480
8481 // st4b/ld4b
8482 ASSERT_EQUAL_SVE(z3, z19);
8483 ASSERT_EQUAL_SVE(z4, z20);
8484 ASSERT_EQUAL_SVE(z5, z21);
8485 ASSERT_EQUAL_SVE(z6, z22);
8486
8487 // st4h/ld4h
8488 ASSERT_EQUAL_SVE(z7, z23);
8489 ASSERT_EQUAL_SVE(z8, z24);
8490 ASSERT_EQUAL_SVE(z9, z25);
8491 ASSERT_EQUAL_SVE(z10, z26);
8492
8493 // st4w/ld4w
8494 ASSERT_EQUAL_SVE(z11, z27);
8495 ASSERT_EQUAL_SVE(z12, z28);
8496 ASSERT_EQUAL_SVE(z13, z29);
8497 ASSERT_EQUAL_SVE(z14, z30);
8498
8499 // st4d/ld4d
8500 ASSERT_EQUAL_SVE(z15, z31);
8501 ASSERT_EQUAL_SVE(z16, z0);
8502 ASSERT_EQUAL_SVE(z17, z1);
8503 ASSERT_EQUAL_SVE(z18, z2);
8504
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008505 delete[] expected;
8506 }
8507 delete[] data;
8508}
8509
8510TEST_SVE(sve_ld234_st234_scalar_plus_scalar_sp) {
8511 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
8512 START();
8513
8514 // Check that the simulator correctly interprets rn == 31 as sp.
8515 // The indexing logic is the same regardless so we just check one load and
8516 // store of each type.
8517
8518 // There are no pre- or post-indexing modes, so reserve space first.
8519 __ ClaimVL(2 + 3 + 4);
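  // The claimed area is used, from sp upwards, as 2 VLs for st2b (x0 = 0),
  // then 3 VLs for st3h (x1 with LSL #1 gives a byte offset of 2 VLs), then
  // 4 VLs for st4w (x2 with LSL #2 gives a byte offset of 5 VLs).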
8520
8521 __ Index(z0.VnB(), 42, 2);
8522 __ Index(z1.VnB(), 43, 2);
8523 __ Ptrue(p0.VnB(), SVE_VL7);
8524 __ Rdvl(x0, 0);
8525 __ St2b(z0.VnB(), z1.VnB(), p0, SVEMemOperand(sp, x0));
8526
8527 __ Index(z4.VnH(), 42, 3);
8528 __ Index(z5.VnH(), 43, 3);
8529 __ Index(z6.VnH(), 44, 3);
8530 __ Ptrue(p1.VnH(), SVE_POW2);
8531 __ Rdvl(x1, 2);
8532 __ Lsr(x1, x1, 1);
8533 __ St3h(z4.VnH(), z5.VnH(), z6.VnH(), p1, SVEMemOperand(sp, x1, LSL, 1));
8534
8535 __ Index(z8.VnS(), 42, 4);
8536 __ Index(z9.VnS(), 43, 4);
8537 __ Index(z10.VnS(), 44, 4);
8538 __ Index(z11.VnS(), 45, 4);
8539 __ Ptrue(p2.VnS());
8540 __ Rdvl(x2, 2 + 3);
8541 __ Lsr(x2, x2, 2);
8542 __ St4w(z8.VnS(),
8543 z9.VnS(),
8544 z10.VnS(),
8545 z11.VnS(),
8546 p2,
8547 SVEMemOperand(sp, x2, LSL, 2));
8548
Jacob Bramleye483ce52019-11-05 16:52:29 +00008549 // Corresponding loads.
8550 // We have to explicitly zero inactive lanes in the reference values because
8551 // loads have zeroing predication.
8552 __ Dup(z12.VnB(), 0);
8553 __ Dup(z13.VnB(), 0);
8554 __ Mov(z12.VnB(), p0.Merging(), z0.VnB());
8555 __ Mov(z13.VnB(), p0.Merging(), z1.VnB());
8556 __ Ld2b(z0.VnB(), z1.VnB(), p0.Zeroing(), SVEMemOperand(sp, x0));
8557
8558 __ Dup(z16.VnH(), 0);
8559 __ Dup(z17.VnH(), 0);
8560 __ Dup(z18.VnH(), 0);
8561 __ Mov(z16.VnH(), p1.Merging(), z4.VnH());
8562 __ Mov(z17.VnH(), p1.Merging(), z5.VnH());
8563 __ Mov(z18.VnH(), p1.Merging(), z6.VnH());
8564 __ Ld3h(z4.VnH(),
8565 z5.VnH(),
8566 z6.VnH(),
8567 p1.Zeroing(),
8568 SVEMemOperand(sp, x1, LSL, 1));
8569
8570 __ Dup(z20.VnS(), 0);
8571 __ Dup(z21.VnS(), 0);
8572 __ Dup(z22.VnS(), 0);
8573 __ Dup(z23.VnS(), 0);
8574 __ Mov(z20.VnS(), p2.Merging(), z8.VnS());
8575 __ Mov(z21.VnS(), p2.Merging(), z9.VnS());
8576 __ Mov(z22.VnS(), p2.Merging(), z10.VnS());
8577 __ Mov(z23.VnS(), p2.Merging(), z11.VnS());
8578 __ Ld4w(z8.VnS(),
8579 z9.VnS(),
8580 z10.VnS(),
8581 z11.VnS(),
8582 p2.Zeroing(),
8583 SVEMemOperand(sp, x2, LSL, 2));
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008584
8585 __ DropVL(2 + 3 + 4);
8586
8587 END();
8588
8589 if (CAN_RUN()) {
8590 RUN();
8591
8592 // The most likely failure mode is that the simulator reads sp as xzr and
8593 // crashes on execution. We already test the address calculations separately
8594 // and sp doesn't change this, so just test that we load the values we
8595 // stored.
Jacob Bramleye483ce52019-11-05 16:52:29 +00008596
8597 // st2b/ld2b
8598 ASSERT_EQUAL_SVE(z0, z12);
8599 ASSERT_EQUAL_SVE(z1, z13);
8600
8601 // st3h/ld3h
8602 ASSERT_EQUAL_SVE(z4, z16);
8603 ASSERT_EQUAL_SVE(z5, z17);
8604 ASSERT_EQUAL_SVE(z6, z18);
8605
8606 // st4w/ld4w
8607 ASSERT_EQUAL_SVE(z8, z20);
8608 ASSERT_EQUAL_SVE(z9, z21);
8609 ASSERT_EQUAL_SVE(z10, z22);
8610 ASSERT_EQUAL_SVE(z11, z23);
Jacob Bramleybc4a54f2019-11-04 16:44:01 +00008611 }
8612}
8613
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008614TEST_SVE(sve_ld234_st234_scalar_plus_imm_sp) {
8615 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
8616 START();
8617
8618 // Check that the simulator correctly interprets rn == 31 as sp.
8619 // The indexing logic is the same regardless so we just check one load and
8620 // store of each type.
8621
8622 // There are no pre- or post-indexing modes, so reserve space first.
8623 // Note that the stores fill in an order that allows each immediate to be a
8624 // multiple of the number of registers.
8625 __ ClaimVL(4 + 2 + 3);
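  // The claimed area is used, from sp upwards, as 4 VLs for st4w (immediate
  // 0), then 2 VLs for st2b (immediate 4, a multiple of 2), then 3 VLs for
  // st3h (immediate 6, a multiple of 3).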
8626
8627 __ Index(z0.VnB(), 42, 2);
8628 __ Index(z1.VnB(), 43, 2);
8629 __ Ptrue(p0.VnB(), SVE_POW2);
8630 __ St2b(z0.VnB(), z1.VnB(), p0, SVEMemOperand(sp, 4, SVE_MUL_VL));
8631
8632 __ Index(z4.VnH(), 42, 3);
8633 __ Index(z5.VnH(), 43, 3);
8634 __ Index(z6.VnH(), 44, 3);
8635 __ Ptrue(p1.VnH(), SVE_VL7);
8636 __ St3h(z4.VnH(), z5.VnH(), z6.VnH(), p1, SVEMemOperand(sp, 6, SVE_MUL_VL));
8637
8638 __ Index(z8.VnS(), 42, 4);
8639 __ Index(z9.VnS(), 43, 4);
8640 __ Index(z10.VnS(), 44, 4);
8641 __ Index(z11.VnS(), 45, 4);
8642 __ Ptrue(p2.VnS());
8643 __ St4w(z8.VnS(), z9.VnS(), z10.VnS(), z11.VnS(), p2, SVEMemOperand(sp));
8644
Jacob Bramleye5ab0fe2019-11-05 16:52:29 +00008645 // Corresponding loads.
8646 // We have to explicitly zero inactive lanes in the reference values because
8647 // loads have zeroing predication.
8648 __ Dup(z12.VnB(), 0);
8649 __ Dup(z13.VnB(), 0);
8650 __ Mov(z12.VnB(), p0.Merging(), z0.VnB());
8651 __ Mov(z13.VnB(), p0.Merging(), z1.VnB());
8652 __ Ld2b(z0.VnB(), z1.VnB(), p0.Zeroing(), SVEMemOperand(sp, 4, SVE_MUL_VL));
8653
8654 __ Dup(z16.VnH(), 0);
8655 __ Dup(z17.VnH(), 0);
8656 __ Dup(z18.VnH(), 0);
8657 __ Mov(z16.VnH(), p1.Merging(), z4.VnH());
8658 __ Mov(z17.VnH(), p1.Merging(), z5.VnH());
8659 __ Mov(z18.VnH(), p1.Merging(), z6.VnH());
8660 __ Ld3h(z4.VnH(),
8661 z5.VnH(),
8662 z6.VnH(),
8663 p1.Zeroing(),
8664 SVEMemOperand(sp, 6, SVE_MUL_VL));
8665
8666 __ Dup(z20.VnS(), 0);
8667 __ Dup(z21.VnS(), 0);
8668 __ Dup(z22.VnS(), 0);
8669 __ Dup(z23.VnS(), 0);
8670 __ Mov(z20.VnS(), p2.Merging(), z8.VnS());
8671 __ Mov(z21.VnS(), p2.Merging(), z9.VnS());
8672 __ Mov(z22.VnS(), p2.Merging(), z10.VnS());
8673 __ Mov(z23.VnS(), p2.Merging(), z11.VnS());
8674 __ Ld4w(z8.VnS(),
8675 z9.VnS(),
8676 z10.VnS(),
8677 z11.VnS(),
8678 p2.Zeroing(),
8679 SVEMemOperand(sp));
Jacob Bramleyd4dd9c22019-11-04 16:44:01 +00008680
8681 __ DropVL(4 + 2 + 3);
8682
8683 END();
8684
8685 if (CAN_RUN()) {
8686 RUN();
8687
8688 // The most likely failure mode is that the simulator reads sp as xzr and
8689 // crashes on execution. We already test the address calculations separately
8690 // and sp doesn't change this, so just test that we load the values we
8691 // stored.
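    // Check that we loaded back the values we stored, mirroring the
    // scalar-plus-scalar variant of this test above.

    // st2b/ld2b
    ASSERT_EQUAL_SVE(z0, z12);
    ASSERT_EQUAL_SVE(z1, z13);

    // st3h/ld3h
    ASSERT_EQUAL_SVE(z4, z16);
    ASSERT_EQUAL_SVE(z5, z17);
    ASSERT_EQUAL_SVE(z6, z18);

    // st4w/ld4w
    ASSERT_EQUAL_SVE(z8, z20);
    ASSERT_EQUAL_SVE(z9, z21);
    ASSERT_EQUAL_SVE(z10, z22);
    ASSERT_EQUAL_SVE(z11, z23);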
8693 }
8694}
8695
TatWai Chong85e15102020-05-04 21:00:40 -07008696// Fill the input buffer with arbitrary data, and assign random offsets (from
8697// the base address of the buffer) to the offsets argument. If they are
8698// provided, also fill the corresponding addresses and the maximum address.
8699static void BufferFillingHelper(uint64_t data_ptr,
8700 size_t buffer_size,
8701 unsigned lane_size_in_bytes,
8702 int lane_count,
8703 uint64_t* offsets,
8704 uint64_t* addresses = nullptr,
8705 uint64_t* max_address = nullptr) {
8706 // Use a fixed seed for nrand48() so that test runs are reproducible.
mmc28a1a2c1d32024-02-01 16:43:49 +00008707 unsigned short seed[3] = {1, 2, 3}; // NOLINT(google-runtime-int)
TatWai Chong85e15102020-05-04 21:00:40 -07008708
8709 // Fill a buffer with arbitrary data.
8710 for (size_t i = 0; i < buffer_size; i++) {
8711 uint8_t byte = nrand48(seed) & 0xff;
8712 memcpy(reinterpret_cast<void*>(data_ptr + i), &byte, 1);
8713 }
8714
8715 if (max_address != nullptr) {
8716 *max_address = 0;
8717 }
8718
8719 // Vectors of random addresses and offsets into the buffer.
8720 for (int i = 0; i < lane_count; i++) {
8721 uint64_t rnd = nrand48(seed);
8722 // Limit the range to the set of completely-accessible elements in memory.
8723 offsets[i] = rnd % (buffer_size - lane_size_in_bytes);
8724 if ((addresses != nullptr) && (max_address != nullptr)) {
8725 addresses[i] = data_ptr + offsets[i];
8726 *max_address = std::max(*max_address, addresses[i]);
8727 }
8728 }
8729}
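
// A typical call (as made by the gather-load helpers below) might look like
// this, assuming a `buffer_size`-byte buffer at `data` and using the same
// names as those helpers:
//
//   uint64_t offsets[kMaxLaneCount];
//   uint64_t addresses[kMaxLaneCount];
//   uint64_t max_address = 0;
//   BufferFillingHelper(data, buffer_size, msize_in_bytes, kMaxLaneCount,
//                       offsets, addresses, &max_address);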
8730
TatWai Chong85e15102020-05-04 21:00:40 -07008731static void ScalarLoadHelper(MacroAssembler* masm,
8732 Register dst,
8733 Register addr,
8734 int msize_in_bits,
8735 bool is_signed) {
8736 if (is_signed) {
8737 switch (msize_in_bits) {
8738 case kBRegSize:
8739 masm->Ldrsb(dst, MemOperand(addr));
8740 break;
8741 case kHRegSize:
8742 masm->Ldrsh(dst, MemOperand(addr));
8743 break;
8744 case kWRegSize:
8745 masm->Ldrsw(dst, MemOperand(addr));
8746 break;
8747 default:
8748 VIXL_UNIMPLEMENTED();
8749 break;
8750 }
8751 } else {
8752 switch (msize_in_bits) {
8753 case kBRegSize:
8754 masm->Ldrb(dst, MemOperand(addr));
8755 break;
8756 case kHRegSize:
8757 masm->Ldrh(dst, MemOperand(addr));
8758 break;
8759 case kWRegSize:
8760 masm->Ldr(dst.W(), MemOperand(addr));
8761 break;
8762 case kXRegSize:
8763 masm->Ldr(dst, MemOperand(addr));
8764 break;
8765 default:
8766 VIXL_UNIMPLEMENTED();
8767 break;
8768 }
8769 }
8770}
8771
8772// Generate a reference result using scalar loads.
8773// For now this helper doesn't save or restore the caller's registers.
8774// It clobbers registers z30, x28, x29 and p7.
8775template <size_t N>
8776static void ScalarLoadHelper(MacroAssembler* masm,
8777 int vl,
8778 const uint64_t (&addresses)[N],
8779 const ZRegister& zt_ref,
8780 const PRegisterZ& pg,
8781 unsigned esize_in_bits,
8782 unsigned msize_in_bits,
8783 bool is_signed) {
8784 unsigned esize_in_bytes = esize_in_bits / kBitsPerByte;
8785 ZRegister lane_numbers = z30.WithLaneSize(esize_in_bits);
8786 masm->Index(lane_numbers, 0, 1);
8787 masm->Dup(zt_ref, 0);
8788 for (unsigned i = 0; i < (vl / esize_in_bytes); i++) {
8789 masm->Mov(x29, addresses[N - i - 1]);
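    // Use register 28 as w28 when the element size is 32 bits or narrower,
    // and as x28 for 64-bit elements.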
8790 Register rt(28, std::min(std::max(esize_in_bits, kSRegSize), kDRegSize));
8791 ScalarLoadHelper(masm, rt, x29, msize_in_bits, is_signed);
8792
8793 // Emulate predication.
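    // Cmpeq leaves at most one lane active in p7 (lane i, and only if pg is
    // active there), so the Cpy below writes rt into just that lane of the
    // reference vector.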
8794 masm->Cmpeq(p7.WithLaneSize(esize_in_bits), pg, lane_numbers, i);
8795 masm->Cpy(zt_ref, p7.Merging(), rt);
8796 }
8797}
8798
TatWai Chong113d9192020-05-19 01:02:36 -07008799typedef void (MacroAssembler::*Ld1Macro)(const ZRegister& zt,
8800 const PRegisterZ& pg,
8801 const SVEMemOperand& addr);
8802
Martyn Capewella5112342020-06-05 18:20:11 +01008803template <typename T>
TatWai Chong6537a9a2020-05-05 14:15:16 -07008804static void Ldff1Helper(Test* config,
8805 uintptr_t data,
8806 unsigned msize_in_bits,
8807 unsigned esize_in_bits,
TatWai Chong1af34f12020-06-01 20:54:06 -07008808 CPURegister::RegisterType base_type,
TatWai Chong6537a9a2020-05-05 14:15:16 -07008809 Ld1Macro ldff1,
8810 Ld1Macro ld1,
Martyn Capewella5112342020-06-05 18:20:11 +01008811 T mod,
TatWai Chong6537a9a2020-05-05 14:15:16 -07008812 bool scale = false) {
8813 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
8814 START();
8815
8816 int vl = config->sve_vl_in_bytes();
8817 size_t page_size = sysconf(_SC_PAGE_SIZE);
8818 VIXL_ASSERT(page_size > static_cast<size_t>(vl));
8819
8820 unsigned esize_in_bytes = esize_in_bits / kBitsPerByte;
8821 unsigned msize_in_bytes = msize_in_bits / kBitsPerByte;
8822 unsigned msize_in_bytes_log2 = std::log2(msize_in_bytes);
8823 VIXL_ASSERT(msize_in_bits <= esize_in_bits);
8824
8825 PRegister all = p7;
8826 __ Ptrue(all.VnB());
8827
8828 size_t offset_modifier = 0;
8829
Martyn Capewell5f9b3802020-03-24 16:16:36 +00008830 // The highest address at which a load stopped. Every FF load should fault at
TatWai Chong6537a9a2020-05-05 14:15:16 -07008831 // `data + page_size`, so this value should not exceed that value. However,
8832 // the architecture allows fault-tolerant loads to fault arbitrarily, so the
8833 // real value may be lower.
8834 //
8835 // This is used to check that the caller's `mprotect` really does make the second
8836 // page inaccessible, and that the resulting FFR from each load reflects that.
8837 Register limit = x22;
8838 __ Mov(limit, 0);
8839
8840 // If the FFR grows unexpectedly, we increment this register by the
8841 // difference. FFR should never grow, except when explicitly set.
8842 Register ffr_grow_count = x23;
8843 __ Mov(ffr_grow_count, 0);
8844
8845 // Set the offset so that the load is guaranteed to start in the
8846 // accessible page, but end in the inaccessible one.
8847 VIXL_ASSERT((page_size % msize_in_bytes) == 0);
8848 VIXL_ASSERT((vl % msize_in_bytes) == 0);
8849 size_t elements_per_page = page_size / msize_in_bytes;
8850 size_t elements_per_access = vl / esize_in_bytes;
8851 size_t min_offset = (elements_per_page - elements_per_access) + 1;
8852 size_t max_offset = elements_per_page - 1;
8853 size_t offset =
8854 min_offset + (offset_modifier % (max_offset - min_offset + 1));
8855 offset_modifier++;
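  // For example, assuming a 4KB page and a 16-byte VL with byte elements
  // (msize == esize == 1): elements_per_page is 4096 and elements_per_access
  // is 16, so the offset is chosen from [4081, 4095] and the access always
  // straddles the page boundary.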
8856
8857 __ Setffr();
8858 __ Mov(x20, data);
8859 __ Mov(x21, offset);
8860
TatWai Chong1af34f12020-06-01 20:54:06 -07008861 if (base_type == CPURegister::kRegister) {
8862 // Scalar-plus-scalar mode.
Martyn Capewella5112342020-06-05 18:20:11 +01008863 VIXL_ASSERT((std::is_same<T, vixl::aarch64::Shift>::value));
8864 VIXL_ASSERT((static_cast<int>(mod) == LSL) ||
8865 (static_cast<int>(mod) == NO_SHIFT));
8866 (masm.*ldff1)(z0.WithLaneSize(esize_in_bits),
8867 all.Zeroing(),
8868 SVEMemOperand(x20, x21, mod, msize_in_bytes_log2));
8869 } else {
8870 VIXL_ASSERT(base_type == CPURegister::kZRegister);
TatWai Chong1af34f12020-06-01 20:54:06 -07008871 int offs_size;
8872 bool offs_is_unsigned;
Martyn Capewella5112342020-06-05 18:20:11 +01008873 if (std::is_same<T, vixl::aarch64::Extend>::value) {
TatWai Chong1af34f12020-06-01 20:54:06 -07008874 // Scalar-plus-vector mode with 32-bit offsets (packed or unpacked), either
8875 // unscaled or scaled.
Martyn Capewella5112342020-06-05 18:20:11 +01008876 VIXL_ASSERT((static_cast<int>(mod) == SXTW) ||
8877 (static_cast<int>(mod) == UXTW));
TatWai Chong1af34f12020-06-01 20:54:06 -07008878 if (scale == true) {
8879 // Gather first-fault byte loads don't support scaled offsets.
8880 VIXL_ASSERT(msize_in_bits != kBRegSize);
8881 }
Martyn Capewella5112342020-06-05 18:20:11 +01008882 offs_is_unsigned = (static_cast<int>(mod) == UXTW) ? true : false;
TatWai Chong1af34f12020-06-01 20:54:06 -07008883 offs_size = kSRegSize;
8884
8885 } else {
8886 // Scalar-plus-vector mode with 64-bit unscaled or scaled offset.
Martyn Capewella5112342020-06-05 18:20:11 +01008887 VIXL_ASSERT((std::is_same<T, vixl::aarch64::Shift>::value));
8888 VIXL_ASSERT((static_cast<int>(mod) == LSL) ||
8889 (static_cast<int>(mod) == NO_SHIFT));
TatWai Chong1af34f12020-06-01 20:54:06 -07008890 offs_is_unsigned = false;
8891 offs_size = kDRegSize;
8892 }
8893
TatWai Chong6537a9a2020-05-05 14:15:16 -07008894 // Generate addresses of the form "base address + (index << shift)".
8895 // For unscaled-offset operations, step the index by `msize_in_bytes` between
8896 // accesses; otherwise step it by one element and let the shift provide the
8897 // scaling.
8898 int shift = (scale == true) ? msize_in_bytes_log2 : 0;
8899 int index_offset = msize_in_bytes >> shift;
8900 VIXL_ASSERT(index_offset > 0);
TatWai Chong6537a9a2020-05-05 14:15:16 -07008901 uint64_t index = 0;
8902 uint64_t base_address = 0;
8903
TatWai Chong1af34f12020-06-01 20:54:06 -07008904 if (offs_is_unsigned == true) {
TatWai Chong6537a9a2020-05-05 14:15:16 -07008905 // Base address.
8906 base_address = data;
8907 // Maximum unsigned positive index.
8908 index = page_size >> shift;
8909
8910 } else {
8911 // Base address.
8912 base_address = data + (2 * page_size);
8913 // Maximum unsigned positive index.
8914 uint64_t uint_e_max =
8915 (esize_in_bits == kDRegSize) ? UINT64_MAX : UINT32_MAX;
8916 index = uint_e_max - (page_size >> shift) + 1;
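      // As a two's-complement value, this index is -(page_size >> shift) in
      // the low bits that the addressing mode uses, so the highest lane's
      // address wraps round to `data + page_size`, the start of the
      // protected page.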
8917 }
8918
8919 __ Mov(x19, base_address);
8920 if ((offs_size == kSRegSize) && (esize_in_bits == kDRegSize)) {
8921 // In this case, the index values are sign- or zero-extended from 32 to
8922 // 64 bits. Put an arbitrary value in the top 32 bits to check that only
8923 // the low 32 bits are used as the index.
8924 index |= 0x1234567800000000;
8925 }
8926
8927 index -= index_offset * (elements_per_access - 1);
8928 __ Index(z17.WithLaneSize(esize_in_bits), index, index_offset);
8929
8930 // Scalar plus vector mode.
8931 (masm.*
8932 ldff1)(z0.WithLaneSize(esize_in_bits),
8933 all.Zeroing(),
8934 SVEMemOperand(x19, z17.WithLaneSize(esize_in_bits), mod, shift));
8935 }
8936
8937 __ Rdffrs(p0.VnB(), all.Zeroing());
8938
8939 // Execute another Ldff1 with no offset, so that every element could be
8940 // read. It should respect FFR, and load no more than we loaded the
8941 // first time.
8942 (masm.*
8943 ldff1)(z16.WithLaneSize(esize_in_bits), all.Zeroing(), SVEMemOperand(x20));
8944 __ Rdffrs(p1.VnB(), all.Zeroing());
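  // The next two instructions compute x0 = max(0, CountActive(p1) -
  // CountActive(p0)), since Uqdecp saturates at zero; that is, the number of
  // lanes by which the FFR grew.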
8945 __ Cntp(x0, all, p1.VnB());
8946 __ Uqdecp(x0, p0.VnB());
8947 __ Add(ffr_grow_count, ffr_grow_count, x0);
8948
8949 // Use the FFR to predicate the normal load. If it wasn't properly set,
8950 // the normal load will abort.
8951 (masm.*ld1)(z16.WithLaneSize(esize_in_bits),
8952 p0.Zeroing(),
8953 SVEMemOperand(x20, x21, LSL, msize_in_bytes_log2));
8954
8955 // Work out the address after the one that was just accessed.
8956 __ Incp(x21, p0.WithLaneSize(esize_in_bits));
8957 __ Add(x0, x20, Operand(x21, LSL, msize_in_bytes_log2));
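  // Keep the highest such address seen so far: limit = max(limit, x0).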
8958 __ Cmp(limit, x0);
8959 __ Csel(limit, limit, x0, hs);
8960
8961 // Clear lanes inactive in FFR. These have an undefined result.
Martyn Capewella24d95c2020-05-20 11:11:15 +01008962 __ Not(p0.VnB(), all.Zeroing(), p0.VnB());
Martyn Capewelle2de6072020-05-22 09:52:06 +01008963 __ Mov(z0.WithLaneSize(esize_in_bits), p0.Merging(), 0);
TatWai Chong6537a9a2020-05-05 14:15:16 -07008964
8965 END();
8966
8967 if (CAN_RUN()) {
8968 RUN();
8969
8970 uintptr_t expected_limit = data + page_size;
8971 uintptr_t measured_limit = core.xreg(limit.GetCode());
8972 VIXL_CHECK(measured_limit <= expected_limit);
8973 if (measured_limit < expected_limit) {
8974 // We can't fail the test for this case, but a warning is helpful for
8975 // manually-run tests.
8976 printf(
8977 "WARNING: All fault-tolerant loads detected faults before the\n"
8978 "expected limit. This is architecturally possible, but improbable,\n"
8979 "and could be a symptom of another problem.\n");
8980 }
8981
8982 ASSERT_EQUAL_64(0, ffr_grow_count);
8983
8984 ASSERT_EQUAL_SVE(z0.WithLaneSize(esize_in_bits),
8985 z16.WithLaneSize(esize_in_bits));
8986 }
8987}
8988
8989TEST_SVE(sve_ldff1_scalar_plus_scalar) {
8990 size_t page_size = sysconf(_SC_PAGE_SIZE);
8991 VIXL_ASSERT(page_size > static_cast<size_t>(config->sve_vl_in_bytes()));
8992
8993 // Allocate two pages, then mprotect the second one to make it inaccessible.
8994 uintptr_t data = reinterpret_cast<uintptr_t>(mmap(NULL,
8995 page_size * 2,
8996 PROT_READ | PROT_WRITE,
8997 MAP_PRIVATE | MAP_ANONYMOUS,
8998 -1,
8999 0));
9000 mprotect(reinterpret_cast<void*>(data + page_size), page_size, PROT_NONE);
9001
9002 // Fill the accessible page with arbitrary data.
9003 for (size_t i = 0; i < page_size; i++) {
9004 // Reverse bits so we get a mixture of positive and negative values.
9005 uint8_t byte = ReverseBits(static_cast<uint8_t>(i));
9006 memcpy(reinterpret_cast<void*>(data + i), &byte, 1);
9007 }
9008
Martyn Capewella5112342020-06-05 18:20:11 +01009009 auto ldff1_unscaled_offset_helper = std::bind(&Ldff1Helper<Shift>,
TatWai Chong1af34f12020-06-01 20:54:06 -07009010 config,
9011 data,
9012 std::placeholders::_1,
9013 std::placeholders::_2,
9014 CPURegister::kRegister,
9015 std::placeholders::_3,
9016 std::placeholders::_4,
Martyn Capewella5112342020-06-05 18:20:11 +01009017 NO_SHIFT,
TatWai Chong1af34f12020-06-01 20:54:06 -07009018 false);
9019
TatWai Chong6537a9a2020-05-05 14:15:16 -07009020 Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
9021 Ld1Macro ld1b = &MacroAssembler::Ld1b;
TatWai Chong1af34f12020-06-01 20:54:06 -07009022 ldff1_unscaled_offset_helper(kBRegSize, kBRegSize, ldff1b, ld1b);
9023 ldff1_unscaled_offset_helper(kBRegSize, kHRegSize, ldff1b, ld1b);
9024 ldff1_unscaled_offset_helper(kBRegSize, kSRegSize, ldff1b, ld1b);
9025 ldff1_unscaled_offset_helper(kBRegSize, kDRegSize, ldff1b, ld1b);
TatWai Chong6537a9a2020-05-05 14:15:16 -07009026
9027 Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
9028 Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
TatWai Chong1af34f12020-06-01 20:54:06 -07009029 ldff1_unscaled_offset_helper(kBRegSize, kHRegSize, ldff1sb, ld1sb);
9030 ldff1_unscaled_offset_helper(kBRegSize, kSRegSize, ldff1sb, ld1sb);
9031 ldff1_unscaled_offset_helper(kBRegSize, kDRegSize, ldff1sb, ld1sb);
9032
Martyn Capewella5112342020-06-05 18:20:11 +01009033 auto ldff1_scaled_offset_helper = std::bind(&Ldff1Helper<Shift>,
TatWai Chong1af34f12020-06-01 20:54:06 -07009034 config,
9035 data,
9036 std::placeholders::_1,
9037 std::placeholders::_2,
9038 CPURegister::kRegister,
9039 std::placeholders::_3,
9040 std::placeholders::_4,
Martyn Capewella5112342020-06-05 18:20:11 +01009041 LSL,
TatWai Chong1af34f12020-06-01 20:54:06 -07009042 true);
9043
9044 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
9045 Ld1Macro ld1h = &MacroAssembler::Ld1h;
9046 ldff1_scaled_offset_helper(kHRegSize, kHRegSize, ldff1h, ld1h);
9047 ldff1_scaled_offset_helper(kHRegSize, kSRegSize, ldff1h, ld1h);
9048 ldff1_scaled_offset_helper(kHRegSize, kDRegSize, ldff1h, ld1h);
9049
9050 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
9051 Ld1Macro ld1w = &MacroAssembler::Ld1w;
9052 ldff1_scaled_offset_helper(kSRegSize, kSRegSize, ldff1w, ld1w);
9053 ldff1_scaled_offset_helper(kSRegSize, kDRegSize, ldff1w, ld1w);
9054
9055 Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
9056 Ld1Macro ld1d = &MacroAssembler::Ld1d;
9057 ldff1_scaled_offset_helper(kDRegSize, kDRegSize, ldff1d, ld1d);
TatWai Chong6537a9a2020-05-05 14:15:16 -07009058
9059 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
9060 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
TatWai Chong1af34f12020-06-01 20:54:06 -07009061 ldff1_scaled_offset_helper(kHRegSize, kSRegSize, ldff1sh, ld1sh);
9062 ldff1_scaled_offset_helper(kHRegSize, kDRegSize, ldff1sh, ld1sh);
TatWai Chong6537a9a2020-05-05 14:15:16 -07009063
9064 Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
9065 Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
TatWai Chong1af34f12020-06-01 20:54:06 -07009066 ldff1_scaled_offset_helper(kSRegSize, kDRegSize, ldff1sw, ld1sw);
TatWai Chong6537a9a2020-05-05 14:15:16 -07009067
9068 munmap(reinterpret_cast<void*>(data), page_size * 2);
9069}
9070
TatWai Chong1af34f12020-06-01 20:54:06 -07009071static void sve_ldff1_scalar_plus_vector_32_scaled_offset(Test* config,
9072 uintptr_t data) {
Martyn Capewella5112342020-06-05 18:20:11 +01009073 auto ldff1_32_scaled_offset_helper = std::bind(&Ldff1Helper<Extend>,
TatWai Chong1af34f12020-06-01 20:54:06 -07009074 config,
9075 data,
9076 std::placeholders::_1,
9077 kSRegSize,
9078 CPURegister::kZRegister,
9079 std::placeholders::_2,
9080 std::placeholders::_3,
9081 std::placeholders::_4,
9082 true);
9083 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
9084 Ld1Macro ld1h = &MacroAssembler::Ld1h;
Martyn Capewella5112342020-06-05 18:20:11 +01009085 ldff1_32_scaled_offset_helper(kHRegSize, ldff1h, ld1h, UXTW);
9086 ldff1_32_scaled_offset_helper(kHRegSize, ldff1h, ld1h, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009087
9088 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
9089 Ld1Macro ld1w = &MacroAssembler::Ld1w;
Martyn Capewella5112342020-06-05 18:20:11 +01009090 ldff1_32_scaled_offset_helper(kSRegSize, ldff1w, ld1w, UXTW);
9091 ldff1_32_scaled_offset_helper(kSRegSize, ldff1w, ld1w, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009092
9093 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
9094 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
Martyn Capewella5112342020-06-05 18:20:11 +01009095 ldff1_32_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh, UXTW);
9096 ldff1_32_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009097}
9098
9099static void sve_ldff1_scalar_plus_vector_32_unscaled_offset(Test* config,
9100 uintptr_t data) {
Martyn Capewella5112342020-06-05 18:20:11 +01009101 auto ldff1_32_unscaled_offset_helper = std::bind(&Ldff1Helper<Extend>,
TatWai Chong1af34f12020-06-01 20:54:06 -07009102 config,
9103 data,
9104 std::placeholders::_1,
9105 kSRegSize,
9106 CPURegister::kZRegister,
9107 std::placeholders::_2,
9108 std::placeholders::_3,
9109 std::placeholders::_4,
9110 false);
9111
9112 Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
9113 Ld1Macro ld1b = &MacroAssembler::Ld1b;
Martyn Capewella5112342020-06-05 18:20:11 +01009114 ldff1_32_unscaled_offset_helper(kBRegSize, ldff1b, ld1b, UXTW);
9115 ldff1_32_unscaled_offset_helper(kBRegSize, ldff1b, ld1b, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009116
9117 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
9118 Ld1Macro ld1h = &MacroAssembler::Ld1h;
Martyn Capewella5112342020-06-05 18:20:11 +01009119 ldff1_32_unscaled_offset_helper(kHRegSize, ldff1h, ld1h, UXTW);
9120 ldff1_32_unscaled_offset_helper(kHRegSize, ldff1h, ld1h, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009121
9122 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
9123 Ld1Macro ld1w = &MacroAssembler::Ld1w;
Martyn Capewella5112342020-06-05 18:20:11 +01009124 ldff1_32_unscaled_offset_helper(kSRegSize, ldff1w, ld1w, UXTW);
9125 ldff1_32_unscaled_offset_helper(kSRegSize, ldff1w, ld1w, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009126
9127 Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
9128 Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
Martyn Capewella5112342020-06-05 18:20:11 +01009129 ldff1_32_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb, UXTW);
9130 ldff1_32_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009131
9132 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
9133 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
Martyn Capewella5112342020-06-05 18:20:11 +01009134 ldff1_32_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh, UXTW);
9135 ldff1_32_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009136}
9137
9138static void sve_ldff1_scalar_plus_vector_32_unpacked_scaled_offset(
9139 Test* config, uintptr_t data) {
9140 auto ldff1_32_unpacked_scaled_offset_helper =
Martyn Capewella5112342020-06-05 18:20:11 +01009141 std::bind(&Ldff1Helper<Extend>,
TatWai Chong1af34f12020-06-01 20:54:06 -07009142 config,
9143 data,
9144 std::placeholders::_1,
9145 kDRegSize,
9146 CPURegister::kZRegister,
9147 std::placeholders::_2,
9148 std::placeholders::_3,
9149 std::placeholders::_4,
9150 true);
9151
9152 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
9153 Ld1Macro ld1h = &MacroAssembler::Ld1h;
Martyn Capewella5112342020-06-05 18:20:11 +01009154 ldff1_32_unpacked_scaled_offset_helper(kHRegSize, ldff1h, ld1h, UXTW);
9155 ldff1_32_unpacked_scaled_offset_helper(kHRegSize, ldff1h, ld1h, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009156
9157 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
9158 Ld1Macro ld1w = &MacroAssembler::Ld1w;
Martyn Capewella5112342020-06-05 18:20:11 +01009159 ldff1_32_unpacked_scaled_offset_helper(kSRegSize, ldff1w, ld1w, UXTW);
9160 ldff1_32_unpacked_scaled_offset_helper(kSRegSize, ldff1w, ld1w, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009161
9162 Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
9163 Ld1Macro ld1d = &MacroAssembler::Ld1d;
Martyn Capewella5112342020-06-05 18:20:11 +01009164 ldff1_32_unpacked_scaled_offset_helper(kDRegSize, ldff1d, ld1d, UXTW);
9165 ldff1_32_unpacked_scaled_offset_helper(kDRegSize, ldff1d, ld1d, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009166
9167 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
9168 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
Martyn Capewella5112342020-06-05 18:20:11 +01009169 ldff1_32_unpacked_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh, UXTW);
9170 ldff1_32_unpacked_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009171
9172 Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
9173 Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
Martyn Capewella5112342020-06-05 18:20:11 +01009174 ldff1_32_unpacked_scaled_offset_helper(kSRegSize, ldff1sw, ld1sw, UXTW);
9175 ldff1_32_unpacked_scaled_offset_helper(kSRegSize, ldff1sw, ld1sw, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009176}
9177
9178static void sve_ldff1_scalar_plus_vector_32_unpacked_unscaled_offset(
9179 Test* config, uintptr_t data) {
9180 auto ldff1_32_unpacked_unscaled_offset_helper =
Martyn Capewella5112342020-06-05 18:20:11 +01009181 std::bind(&Ldff1Helper<Extend>,
TatWai Chong1af34f12020-06-01 20:54:06 -07009182 config,
9183 data,
9184 std::placeholders::_1,
9185 kDRegSize,
9186 CPURegister::kZRegister,
9187 std::placeholders::_2,
9188 std::placeholders::_3,
9189 std::placeholders::_4,
9190 false);
9191
9192 Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
9193 Ld1Macro ld1b = &MacroAssembler::Ld1b;
Martyn Capewella5112342020-06-05 18:20:11 +01009194 ldff1_32_unpacked_unscaled_offset_helper(kBRegSize, ldff1b, ld1b, UXTW);
9195 ldff1_32_unpacked_unscaled_offset_helper(kBRegSize, ldff1b, ld1b, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009196
9197 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
9198 Ld1Macro ld1h = &MacroAssembler::Ld1h;
Martyn Capewella5112342020-06-05 18:20:11 +01009199 ldff1_32_unpacked_unscaled_offset_helper(kHRegSize, ldff1h, ld1h, UXTW);
9200 ldff1_32_unpacked_unscaled_offset_helper(kHRegSize, ldff1h, ld1h, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009201
9202 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
9203 Ld1Macro ld1w = &MacroAssembler::Ld1w;
Martyn Capewella5112342020-06-05 18:20:11 +01009204 ldff1_32_unpacked_unscaled_offset_helper(kSRegSize, ldff1w, ld1w, UXTW);
9205 ldff1_32_unpacked_unscaled_offset_helper(kSRegSize, ldff1w, ld1w, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009206
9207 Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
9208 Ld1Macro ld1d = &MacroAssembler::Ld1d;
Martyn Capewella5112342020-06-05 18:20:11 +01009209 ldff1_32_unpacked_unscaled_offset_helper(kDRegSize, ldff1d, ld1d, UXTW);
9210 ldff1_32_unpacked_unscaled_offset_helper(kDRegSize, ldff1d, ld1d, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009211
9212 Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
9213 Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
Martyn Capewella5112342020-06-05 18:20:11 +01009214 ldff1_32_unpacked_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb, UXTW);
9215 ldff1_32_unpacked_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009216
9217 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
9218 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
Martyn Capewella5112342020-06-05 18:20:11 +01009219 ldff1_32_unpacked_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh, UXTW);
9220 ldff1_32_unpacked_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009221
9222 Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
9223 Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
Martyn Capewella5112342020-06-05 18:20:11 +01009224 ldff1_32_unpacked_unscaled_offset_helper(kSRegSize, ldff1sw, ld1sw, UXTW);
9225 ldff1_32_unpacked_unscaled_offset_helper(kSRegSize, ldff1sw, ld1sw, SXTW);
TatWai Chong1af34f12020-06-01 20:54:06 -07009226}
9227
9228static void sve_ldff1_scalar_plus_vector_64_scaled_offset(Test* config,
9229 uintptr_t data) {
Martyn Capewella5112342020-06-05 18:20:11 +01009230 auto ldff1_64_scaled_offset_helper = std::bind(&Ldff1Helper<Shift>,
TatWai Chong1af34f12020-06-01 20:54:06 -07009231 config,
9232 data,
9233 std::placeholders::_1,
9234 kDRegSize,
9235 CPURegister::kZRegister,
9236 std::placeholders::_2,
9237 std::placeholders::_3,
Martyn Capewella5112342020-06-05 18:20:11 +01009238 LSL,
TatWai Chong1af34f12020-06-01 20:54:06 -07009239 true);
9240
9241 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
9242 Ld1Macro ld1h = &MacroAssembler::Ld1h;
9243 ldff1_64_scaled_offset_helper(kHRegSize, ldff1h, ld1h);
9244
9245 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
9246 Ld1Macro ld1w = &MacroAssembler::Ld1w;
9247 ldff1_64_scaled_offset_helper(kSRegSize, ldff1w, ld1w);
9248
9249 Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
9250 Ld1Macro ld1d = &MacroAssembler::Ld1d;
9251 ldff1_64_scaled_offset_helper(kDRegSize, ldff1d, ld1d);
9252
9253 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
9254 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
9255 ldff1_64_scaled_offset_helper(kHRegSize, ldff1sh, ld1sh);
9256
9257 Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
9258 Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
9259 ldff1_64_scaled_offset_helper(kSRegSize, ldff1sw, ld1sw);
9260}
9261
9262static void sve_ldff1_scalar_plus_vector_64_unscaled_offset(Test* config,
9263 uintptr_t data) {
Martyn Capewella5112342020-06-05 18:20:11 +01009264 auto ldff1_64_unscaled_offset_helper = std::bind(&Ldff1Helper<Shift>,
TatWai Chong1af34f12020-06-01 20:54:06 -07009265 config,
9266 data,
9267 std::placeholders::_1,
9268 kDRegSize,
9269 CPURegister::kZRegister,
9270 std::placeholders::_2,
9271 std::placeholders::_3,
Martyn Capewella5112342020-06-05 18:20:11 +01009272 NO_SHIFT,
TatWai Chong1af34f12020-06-01 20:54:06 -07009273 false);
9274
9275 Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
9276 Ld1Macro ld1b = &MacroAssembler::Ld1b;
9277 ldff1_64_unscaled_offset_helper(kBRegSize, ldff1b, ld1b);
9278
9279 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
9280 Ld1Macro ld1h = &MacroAssembler::Ld1h;
9281 ldff1_64_unscaled_offset_helper(kHRegSize, ldff1h, ld1h);
9282
9283 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
9284 Ld1Macro ld1w = &MacroAssembler::Ld1w;
9285 ldff1_64_unscaled_offset_helper(kSRegSize, ldff1w, ld1w);
9286
9287 Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
9288 Ld1Macro ld1d = &MacroAssembler::Ld1d;
9289 ldff1_64_unscaled_offset_helper(kDRegSize, ldff1d, ld1d);
9290
9291 Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
9292 Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
9293 ldff1_64_unscaled_offset_helper(kBRegSize, ldff1sb, ld1sb);
9294
9295 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
9296 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
9297 ldff1_64_unscaled_offset_helper(kHRegSize, ldff1sh, ld1sh);
9298
9299 Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
9300 Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
9301 ldff1_64_unscaled_offset_helper(kSRegSize, ldff1sw, ld1sw);
9302}
9303
TatWai Chong6537a9a2020-05-05 14:15:16 -07009304TEST_SVE(sve_ldff1_scalar_plus_vector) {
9305 size_t page_size = sysconf(_SC_PAGE_SIZE);
9306 VIXL_ASSERT(page_size > static_cast<size_t>(config->sve_vl_in_bytes()));
9307
9308 // Allocate two pages, then mprotect the second one to make it inaccessible.
9309 uintptr_t data = reinterpret_cast<uintptr_t>(mmap(NULL,
9310 page_size * 2,
9311 PROT_READ | PROT_WRITE,
9312 MAP_PRIVATE | MAP_ANONYMOUS,
9313 -1,
9314 0));
9315 mprotect(reinterpret_cast<void*>(data + page_size), page_size, PROT_NONE);
9316
9317 // Fill the accessible page with arbitrary data.
9318 for (size_t i = 0; i < page_size; i++) {
9319 // Reverse bits so we get a mixture of positive and negative values.
9320 uint8_t byte = ReverseBits(static_cast<uint8_t>(i));
9321 memcpy(reinterpret_cast<void*>(data + i), &byte, 1);
9322 }
9323
TatWai Chong1af34f12020-06-01 20:54:06 -07009324 sve_ldff1_scalar_plus_vector_32_scaled_offset(config, data);
9325 sve_ldff1_scalar_plus_vector_32_unscaled_offset(config, data);
9326 sve_ldff1_scalar_plus_vector_32_unpacked_scaled_offset(config, data);
9327 sve_ldff1_scalar_plus_vector_32_unpacked_unscaled_offset(config, data);
9328 sve_ldff1_scalar_plus_vector_64_scaled_offset(config, data);
9329 sve_ldff1_scalar_plus_vector_64_unscaled_offset(config, data);
TatWai Chong6537a9a2020-05-05 14:15:16 -07009330
9331 munmap(reinterpret_cast<void*>(data), page_size * 2);
9332}
9333
Martyn Capewell5f9b3802020-03-24 16:16:36 +00009334TEST_SVE(sve_ldnf1) {
9335 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
9336 CPUFeatures::kNEON,
9337 CPUFeatures::kFP);
9338 START();
9339
9340 size_t page_size = sysconf(_SC_PAGE_SIZE);
9341 VIXL_ASSERT(page_size > static_cast<size_t>(config->sve_vl_in_bytes()));
9342
9343 // Allocate two pages, fill them with data, then mprotect the second one to
9344 // make it inaccessible.
9345 uintptr_t data = reinterpret_cast<uintptr_t>(mmap(NULL,
9346 page_size * 2,
9347 PROT_READ | PROT_WRITE,
9348 MAP_PRIVATE | MAP_ANONYMOUS,
9349 -1,
9350 0));
9351
9352 // Fill the pages with arbitrary data.
9353 for (size_t i = 0; i < page_size; i++) {
9354 // Reverse bits so we get a mixture of positive and negative values.
9355 uint8_t byte = ReverseBits(static_cast<uint8_t>(i));
9356 memcpy(reinterpret_cast<void*>(data + i), &byte, 1);
9357 }
9358
9359 mprotect(reinterpret_cast<void*>(data + page_size), page_size, PROT_NONE);
9360
9361 __ Setffr();
9362 __ Ptrue(p0.VnB());
9363 __ Dup(z10.VnB(), 0);
9364
9365 // Move into x0 an address that points to the last eight unprotected bytes.
9366 __ Mov(x0, data + page_size - (kQRegSizeInBytes / kBRegSizeInBytes) / 2);
9367
9368 // Load, non-faulting, a vector of bytes from x0. At most, eight bytes will be
9369 // loaded, the rest being in a protected page.
9370 __ Ldnf1b(z0.VnB(), p0.Zeroing(), SVEMemOperand(x0));
9371 __ Rdffr(p1.VnB());
9372 __ Setffr();
9373
9374 // Create references using the FFR value in p1 to zero the undefined lanes.
9375 __ Sel(z0.VnB(), p1, z0.VnB(), z10.VnB());
9376 __ Ld1b(z20.VnB(), p1.Zeroing(), SVEMemOperand(x0));
9377
9378 // Repeat for larger elements and different addresses, giving different FFR
9379 // results.
9380 __ Add(x1, x0, 1);
9381 __ Ldnf1h(z1.VnH(), p0.Zeroing(), SVEMemOperand(x1));
9382 __ Rdffr(p1.VnB());
9383 __ Setffr();
9384 __ Sel(z1.VnH(), p1, z1.VnH(), z10.VnH());
9385 __ Ld1h(z21.VnH(), p1.Zeroing(), SVEMemOperand(x1));
9386
9387 __ Add(x1, x0, 2);
9388 __ Ldnf1w(z2.VnS(), p0.Zeroing(), SVEMemOperand(x1));
9389 __ Rdffr(p1.VnB());
9390 __ Setffr();
9391 __ Sel(z2.VnS(), p1, z2.VnS(), z10.VnS());
9392 __ Ld1w(z22.VnS(), p1.Zeroing(), SVEMemOperand(x1));
9393
9394 __ Sub(x1, x0, 1);
9395 __ Ldnf1d(z3.VnD(), p0.Zeroing(), SVEMemOperand(x1));
9396 __ Rdffr(p1.VnB());
9397 __ Setffr();
9398 __ Sel(z3.VnD(), p1, z3.VnD(), z10.VnD());
9399 __ Ld1d(z23.VnD(), p1.Zeroing(), SVEMemOperand(x1));
9400
9401 // Load from the previous VL-sized area of memory. All of this should be in the
9402 // accessible page.
9403 __ Ldnf1b(z4.VnB(), p0.Zeroing(), SVEMemOperand(x0, -1, SVE_MUL_VL));
9404 __ Rdffr(p1.VnB());
9405 __ Setffr();
9406 __ Sel(z4.VnB(), p1, z4.VnB(), z10.VnB());
9407 __ Ld1b(z24.VnB(), p1.Zeroing(), SVEMemOperand(x0, -1, SVE_MUL_VL));
9408
9409 // Repeat the partial load for a larger element size.
9410 __ Mov(x0, data + page_size - (kQRegSizeInBytes / kSRegSizeInBytes) / 2);
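  // (kQRegSizeInBytes / kSRegSizeInBytes) / 2 is two, so x0 points two bytes
  // before the protected page; Ldnf1b loads one byte per S-sized lane, so only
  // the first two lanes are accessible.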
9411 __ Ldnf1b(z5.VnS(), p0.Zeroing(), SVEMemOperand(x0));
9412 __ Rdffr(p1.VnB());
9413 __ Setffr();
9414 __ Sel(z5.VnS(), p1, z5.VnS(), z10.VnS());
9415 __ Ld1b(z25.VnS(), p1.Zeroing(), SVEMemOperand(x0));
9416
9417 // Repeat for sign extension.
9418 __ Mov(x0, data + page_size - (kQRegSizeInBytes / kHRegSizeInBytes) / 2);
9419 __ Ldnf1sb(z6.VnH(), p0.Zeroing(), SVEMemOperand(x0));
9420 __ Rdffr(p1.VnB());
9421 __ Setffr();
9422 __ Sel(z6.VnH(), p1, z6.VnH(), z10.VnH());
9423 __ Ld1sb(z26.VnH(), p1.Zeroing(), SVEMemOperand(x0));
9424
9425 END();
9426
9427 if (CAN_RUN()) {
9428 RUN();
9429 ASSERT_EQUAL_SVE(z20, z0);
9430 ASSERT_EQUAL_SVE(z21, z1);
9431 ASSERT_EQUAL_SVE(z22, z2);
9432 ASSERT_EQUAL_SVE(z23, z3);
9433 ASSERT_EQUAL_SVE(z24, z4);
9434 ASSERT_EQUAL_SVE(z25, z5);
9435 ASSERT_EQUAL_SVE(z26, z6);
9436 }
9437
9438 munmap(reinterpret_cast<void*>(data), page_size * 2);
9439}
9440
TatWai Chongcd3f6c52020-06-14 00:42:39 -07009441// Emphasise testing that the modifiers are propagated and simulated correctly.
9442TEST_SVE(sve_ldff1_regression_test) {
9443 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
9444 START();
9445
9446 size_t page_size = sysconf(_SC_PAGE_SIZE);
9447 VIXL_ASSERT(page_size > static_cast<size_t>(config->sve_vl_in_bytes()));
9448
9449 uintptr_t data = reinterpret_cast<uintptr_t>(mmap(NULL,
9450 page_size * 2,
9451 PROT_READ | PROT_WRITE,
9452 MAP_PRIVATE | MAP_ANONYMOUS,
9453 -1,
9454 0));
9455 uintptr_t middle = data + page_size;
9456 // Fill the accessible page with arbitrary data.
9457 for (size_t i = 0; i < page_size; i++) {
9458 // Reverse bits so we get a mixture of positive and negative values.
9459 uint8_t byte = ReverseBits(static_cast<uint8_t>(i));
9460 memcpy(reinterpret_cast<void*>(middle + i), &byte, 1);
9461 // Make each byte slightly different from the byte at the mirrored
9462 // position, and copy the bytes in the reverse direction so that loads at
9463 // negative indexes can be verified.
9464 byte += 1;
9465 memcpy(reinterpret_cast<void*>(middle - i), &byte, 1);
9466 }
9467
9468 PRegister all = p6;
9469 __ Ptrue(all.VnB());
9470
9471 __ Mov(x0, middle);
9472 __ Index(z31.VnS(), 0, 3);
9473 __ Neg(z30.VnS(), z31.VnS());
9474
9475 __ Setffr();
9476
9477 // Scalar plus vector 32 unscaled offset
9478 __ Ldff1b(z1.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
9479 __ Ldff1h(z2.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW));
9480 __ Ldff1w(z3.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
9481 __ Ldff1sb(z4.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW));
9482 __ Ldff1sh(z5.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
9483
9484 // Scalar plus vector 32 scaled offset
9485 __ Ldff1h(z6.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW, 1));
9486 __ Ldff1w(z7.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW, 2));
9487 __ Ldff1sh(z8.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW, 1));
9488
9489 __ Index(z31.VnD(), 0, 3);
9490 __ Neg(z30.VnD(), z31.VnD());
9491
9492 // Ensure that only the low 32 bits are used when testing with positive index
9493 // values. This also tests that the indexes are treated as positive in `uxtw` form.
9494 __ Mov(x3, 0x8000000080000000);
9495 __ Dup(z28.VnD(), x3);
9496 __ Sub(x2, x0, 0x80000000);
9497 __ Add(z29.VnD(), z31.VnD(), z28.VnD());
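  // Each lane of z29 holds 0x8000000080000000 + (3 * lane). In `uxtw` form
  // only the low 32 bits are used, so with x2 (= middle - 0x80000000) these
  // accesses land at middle + (3 * lane).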
9498
9499 // Scalar plus vector 32 unpacked unscaled offset
9500 __ Ldff1b(z9.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9501 __ Ldff1h(z10.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW));
9502 __ Ldff1w(z11.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9503 __ Ldff1sb(z12.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9504 __ Ldff1sh(z13.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW));
9505 __ Ldff1sw(z14.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9506
9507 // Scalar plus vector 32 unpacked scaled offset
9508 __ Ldff1h(z15.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 1));
9509 __ Ldff1w(z16.VnD(), all.Zeroing(), SVEMemOperand(x0, z31.VnD(), UXTW, 2));
9510 __ Ldff1d(z17.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 3));
9511 __ Ldff1sh(z18.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 1));
9512 __ Ldff1sw(z19.VnD(), all.Zeroing(), SVEMemOperand(x0, z31.VnD(), UXTW, 2));
9513
9514 __ Sub(x0, x0, x3);
9515 // Note that `0x8000000080000000` has been added to the positive indexes. The
9516 // wrong address will be accessed if the index is treated as negative.
9517
9518 // Scalar plus vector 64 unscaled offset
9519 __ Ldff1b(z20.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9520 __ Ldff1h(z21.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9521 __ Ldff1w(z22.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9522 __ Ldff1sh(z23.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9523 __ Ldff1sw(z24.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9524
9525 // Scalar plus vector 64 scaled offset
9526 __ Lsr(z29.VnD(), z28.VnD(), 1); // Shift right to 0x4000000040000000
9527 __ Add(z30.VnD(), z31.VnD(), z29.VnD());
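  // After LSL #1 the offset becomes 0x8000000080000000 + (6 * lane), which
  // together with x0 (= middle - 0x8000000080000000) addresses
  // middle + (6 * lane).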
9528 __ Ldff1h(z25.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 1));
9529 __ Ldff1sh(z26.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 1));
9530
9531 __ Lsr(z29.VnD(), z29.VnD(), 1); // Shift right to 0x2000000020000000
9532 __ Add(z30.VnD(), z31.VnD(), z29.VnD());
9533 __ Ldff1w(z27.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 2));
9534 __ Ldff1sw(z28.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 2));
9535
9536 __ Lsr(z29.VnD(), z29.VnD(), 1); // Shift right to 0x1000000010000000
9537 __ Add(z30.VnD(), z31.VnD(), z29.VnD());
9538 __ Ldff1d(z29.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 3));
9539
9540 __ Rdffr(p1.VnB());
9541 __ Cntp(x10, all, p1.VnB());
9542
9543 END();
9544
9545 if (CAN_RUN()) {
9546 RUN();
9547
9548 int64_t loaded_data_in_bytes = core.xreg(x10.GetCode());
9549 // Only check 128 bits in this test.
9550 if (loaded_data_in_bytes < kQRegSizeInBytes) {
9551 // Report a warning when the fault-tolerant loads faulted before all the
9552 // expected loads were performed.
9553 printf(
9554 "WARNING: Fault-tolerant loads detected faults before the "
9555 "expected loads completed.\n");
9556 return;
9557 }
9558
9559 // Scalar plus vector 32 unscaled offset
9560 uint32_t expected_z1[] = {0x00000090, 0x00000060, 0x000000c0, 0x00000001};
9561 uint32_t expected_z2[] = {0x00001191, 0x0000a161, 0x000041c1, 0x00008001};
9562 uint32_t expected_z3[] = {0x30d05090, 0x9010e060, 0x60a020c0, 0xc0408001};
9563 uint32_t expected_z4[] = {0xffffff91, 0x00000061, 0xffffffc1, 0x00000001};
9564 uint32_t expected_z5[] = {0x00005090, 0xffffe060, 0x000020c0, 0xffff8001};
9565
9566 ASSERT_EQUAL_SVE(expected_z1, z1.VnS());
9567 ASSERT_EQUAL_SVE(expected_z2, z2.VnS());
9568 ASSERT_EQUAL_SVE(expected_z3, z3.VnS());
9569 ASSERT_EQUAL_SVE(expected_z4, z4.VnS());
9570 ASSERT_EQUAL_SVE(expected_z5, z5.VnS());
9571
9572 // Scalar plus vector 32 scaled offset
9573 uint32_t expected_z6[] = {0x0000c848, 0x0000b030, 0x0000e060, 0x00008001};
9574 uint32_t expected_z7[] = {0xe464a424, 0xd8589818, 0xf070b030, 0xc0408001};
9575 uint32_t expected_z8[] = {0xffff8949, 0xffffd131, 0xffffa161, 0xffff8001};
9576
9577 ASSERT_EQUAL_SVE(expected_z6, z6.VnS());
9578 ASSERT_EQUAL_SVE(expected_z7, z7.VnS());
9579 ASSERT_EQUAL_SVE(expected_z8, z8.VnS());
9580
9581 // Scalar plus vector 32 unpacked unscaled offset
9582 uint64_t expected_z9[] = {0x00000000000000c0, 0x0000000000000001};
9583 uint64_t expected_z10[] = {0x00000000000041c1, 0x0000000000008001};
9584 uint64_t expected_z11[] = {0x0000000060a020c0, 0x00000000c0408001};
9585 uint64_t expected_z12[] = {0xffffffffffffffc0, 0x0000000000000001};
9586 uint64_t expected_z13[] = {0x00000000000041c1, 0xffffffffffff8001};
9587 uint64_t expected_z14[] = {0x0000000060a020c0, 0xffffffffc0408001};
9588
9589 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
9590 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
9591 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
9592 ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
9593 ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
9594 ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
9595
9596 // Scalar plus vector 32 unpacked scaled offset
9597 uint64_t expected_z15[] = {0x000000000000a161, 0x0000000000008001};
9598 uint64_t expected_z16[] = {0x00000000f070b030, 0x00000000c0408001};
9599 uint64_t expected_z17[] = {0x8949c929a969e919, 0xe060a020c0408001};
9600 uint64_t expected_z18[] = {0xffffffffffffa161, 0xffffffffffff8001};
9601 uint64_t expected_z19[] = {0xfffffffff070b030, 0xffffffffc0408001};
9602
9603 ASSERT_EQUAL_SVE(expected_z15, z15.VnD());
9604 ASSERT_EQUAL_SVE(expected_z16, z16.VnD());
9605 ASSERT_EQUAL_SVE(expected_z17, z17.VnD());
9606 ASSERT_EQUAL_SVE(expected_z18, z18.VnD());
9607 ASSERT_EQUAL_SVE(expected_z19, z19.VnD());
9608
9609 // Scalar plus vector 64 unscaled offset
9610 uint64_t expected_z20[] = {0x00000000000000c0, 0x0000000000000001};
9611 uint64_t expected_z21[] = {0x00000000000020c0, 0x0000000000008001};
9612 uint64_t expected_z22[] = {0x0000000060a020c0, 0x00000000c0408001};
9613 uint64_t expected_z23[] = {0x00000000000020c0, 0xffffffffffff8001};
9614 uint64_t expected_z24[] = {0x0000000060a020c0, 0xffffffffc0408001};
9615
9616 ASSERT_EQUAL_SVE(expected_z20, z20.VnD());
9617 ASSERT_EQUAL_SVE(expected_z21, z21.VnD());
9618 ASSERT_EQUAL_SVE(expected_z22, z22.VnD());
9619 ASSERT_EQUAL_SVE(expected_z23, z23.VnD());
9620 ASSERT_EQUAL_SVE(expected_z24, z24.VnD());
9621
9622 uint64_t expected_z25[] = {0x000000000000e060, 0x0000000000008001};
9623 uint64_t expected_z26[] = {0xffffffffffffe060, 0xffffffffffff8001};
9624 uint64_t expected_z27[] = {0x00000000f070b030, 0x00000000c0408001};
9625 uint64_t expected_z28[] = {0xfffffffff070b030, 0xffffffffc0408001};
9626 uint64_t expected_z29[] = {0xf878b838d8589818, 0xe060a020c0408001};
9627
9628 // Scalar plus vector 64 scaled offset
9629 ASSERT_EQUAL_SVE(expected_z25, z25.VnD());
9630 ASSERT_EQUAL_SVE(expected_z26, z26.VnD());
9631 ASSERT_EQUAL_SVE(expected_z27, z27.VnD());
9632 ASSERT_EQUAL_SVE(expected_z28, z28.VnD());
9633 ASSERT_EQUAL_SVE(expected_z29, z29.VnD());
9634 }
9635}
9636
Martyn Capewella5112342020-06-05 18:20:11 +01009637// Emphasise testing that the modifiers are propagated and simulated correctly.
9638TEST_SVE(sve_ld1_regression_test) {
9639 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
9640 START();
9641
9642 size_t page_size = sysconf(_SC_PAGE_SIZE);
9643 VIXL_ASSERT(page_size > static_cast<size_t>(config->sve_vl_in_bytes()));
9644
9645 uintptr_t data = reinterpret_cast<uintptr_t>(mmap(NULL,
9646 page_size * 2,
9647 PROT_READ | PROT_WRITE,
9648 MAP_PRIVATE | MAP_ANONYMOUS,
9649 -1,
9650 0));
9651 uintptr_t middle = data + page_size;
9652 // Fill the accessible page with arbitrary data.
9653 for (size_t i = 0; i < page_size; i++) {
9654 // Reverse bits so we get a mixture of positive and negative values.
9655 uint8_t byte = ReverseBits(static_cast<uint8_t>(i));
9656 memcpy(reinterpret_cast<void*>(middle + i), &byte, 1);
9657 // Make each byte slightly different from the byte at the mirrored
9658 // position, and copy the bytes in the reverse direction so that loads at
9659 // negative indexes can be verified.
9660 byte += 1;
9661 memcpy(reinterpret_cast<void*>(middle - i), &byte, 1);
9662 }
9663
9664 PRegister all = p6;
9665 __ Ptrue(all.VnB());
9666
9667 __ Mov(x0, middle);
9668 __ Index(z31.VnS(), 0, 3);
9669 __ Neg(z30.VnS(), z31.VnS());
9670
9671 // Scalar plus vector 32 unscaled offset
9672 __ Ld1b(z1.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
9673 __ Ld1h(z2.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW));
9674 __ Ld1w(z3.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
9675 __ Ld1sb(z4.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW));
9676 __ Ld1sh(z5.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW));
9677
9678 // Scalar plus vector 32 scaled offset
9679 __ Ld1h(z6.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW, 1));
9680 __ Ld1w(z7.VnS(), all.Zeroing(), SVEMemOperand(x0, z31.VnS(), UXTW, 2));
9681 __ Ld1sh(z8.VnS(), all.Zeroing(), SVEMemOperand(x0, z30.VnS(), SXTW, 1));
9682
9683 __ Index(z31.VnD(), 0, 3);
9684 __ Neg(z30.VnD(), z31.VnD());
9685
9686 // Ensure that only the low 32 bits are used when testing with positive index
9687 // values. This also tests that the indexes are treated as positive in `uxtw` form.
9688 __ Mov(x3, 0x8000000080000000);
9689 __ Dup(z28.VnD(), x3);
9690 __ Sub(x2, x0, 0x80000000);
9691 __ Add(z29.VnD(), z31.VnD(), z28.VnD());
9692
9693 // Scalar plus vector 32 unpacked unscaled offset
9694 __ Ld1b(z9.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9695 __ Ld1h(z10.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW));
9696 __ Ld1w(z11.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9697 __ Ld1sb(z12.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9698 __ Ld1sh(z13.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW));
9699 __ Ld1sw(z14.VnD(), all.Zeroing(), SVEMemOperand(x2, z29.VnD(), UXTW));
9700
9701 // Scalar plus vector 32 unpacked scaled offset
9702 __ Ld1h(z15.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 1));
9703 __ Ld1w(z16.VnD(), all.Zeroing(), SVEMemOperand(x0, z31.VnD(), UXTW, 2));
9704 __ Ld1d(z17.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 3));
9705 __ Ld1sh(z18.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), SXTW, 1));
9706 __ Ld1sw(z19.VnD(), all.Zeroing(), SVEMemOperand(x0, z31.VnD(), UXTW, 2));
9707
9708 __ Sub(x0, x0, x3);
9709 // Note that `0x8000000080000000` has been added to the positive indexes. The
9710 // wrong address will be accessed if the index is treated as negative.
9711
9712 // Scalar plus vector 64 unscaled offset
9713 __ Ld1b(z20.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9714 __ Ld1h(z21.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9715 __ Ld1w(z22.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9716 __ Ld1sh(z23.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9717 __ Ld1sw(z24.VnD(), all.Zeroing(), SVEMemOperand(x0, z29.VnD()));
9718
9719 // Scalar plus vector 64 scaled offset
9720 __ Lsr(z29.VnD(), z28.VnD(), 1); // Shift right to 0x4000000040000000
9721 __ Add(z30.VnD(), z31.VnD(), z29.VnD());
9722 __ Ld1h(z25.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 1));
9723 __ Ld1sh(z26.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 1));
9724
9725 __ Lsr(z29.VnD(), z29.VnD(), 1); // Shift right to 0x2000000020000000
9726 __ Add(z30.VnD(), z31.VnD(), z29.VnD());
9727 __ Ld1w(z27.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 2));
9728 __ Ld1sw(z28.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 2));
9729
9730 __ Lsr(z29.VnD(), z29.VnD(), 1); // Shift right to 0x1000000010000000
9731 __ Add(z30.VnD(), z31.VnD(), z29.VnD());
9732 __ Ld1d(z29.VnD(), all.Zeroing(), SVEMemOperand(x0, z30.VnD(), LSL, 3));
9733
9734 END();
9735
9736 if (CAN_RUN()) {
9737 RUN();
9738
9739 // Scalar plus vector 32 unscaled offset
9740 uint32_t expected_z1[] = {0x00000090, 0x00000060, 0x000000c0, 0x00000001};
9741 uint32_t expected_z2[] = {0x00001191, 0x0000a161, 0x000041c1, 0x00008001};
9742 uint32_t expected_z3[] = {0x30d05090, 0x9010e060, 0x60a020c0, 0xc0408001};
9743 uint32_t expected_z4[] = {0xffffff91, 0x00000061, 0xffffffc1, 0x00000001};
9744 uint32_t expected_z5[] = {0x00005090, 0xffffe060, 0x000020c0, 0xffff8001};
9745
9746 ASSERT_EQUAL_SVE(expected_z1, z1.VnS());
9747 ASSERT_EQUAL_SVE(expected_z2, z2.VnS());
9748 ASSERT_EQUAL_SVE(expected_z3, z3.VnS());
9749 ASSERT_EQUAL_SVE(expected_z4, z4.VnS());
9750 ASSERT_EQUAL_SVE(expected_z5, z5.VnS());
9751
9752 // Scalar plus vector 32 scaled offset
9753 uint32_t expected_z6[] = {0x0000c848, 0x0000b030, 0x0000e060, 0x00008001};
9754 uint32_t expected_z7[] = {0xe464a424, 0xd8589818, 0xf070b030, 0xc0408001};
9755 uint32_t expected_z8[] = {0xffff8949, 0xffffd131, 0xffffa161, 0xffff8001};
9756
9757 ASSERT_EQUAL_SVE(expected_z6, z6.VnS());
9758 ASSERT_EQUAL_SVE(expected_z7, z7.VnS());
9759 ASSERT_EQUAL_SVE(expected_z8, z8.VnS());
9760
9761 // Scalar plus vector 32 unpacked unscaled offset
9762 uint64_t expected_z9[] = {0x00000000000000c0, 0x0000000000000001};
9763 uint64_t expected_z10[] = {0x00000000000041c1, 0x0000000000008001};
9764 uint64_t expected_z11[] = {0x0000000060a020c0, 0x00000000c0408001};
9765 uint64_t expected_z12[] = {0xffffffffffffffc0, 0x0000000000000001};
9766 uint64_t expected_z13[] = {0x00000000000041c1, 0xffffffffffff8001};
9767 uint64_t expected_z14[] = {0x0000000060a020c0, 0xffffffffc0408001};
9768
9769 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
9770 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
9771 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
9772 ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
9773 ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
9774 ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
9775
9776 // Scalar plus vector 32 unpacked scaled offset
9777 uint64_t expected_z15[] = {0x000000000000a161, 0x0000000000008001};
9778 uint64_t expected_z16[] = {0x00000000f070b030, 0x00000000c0408001};
9779 uint64_t expected_z17[] = {0x8949c929a969e919, 0xe060a020c0408001};
9780 uint64_t expected_z18[] = {0xffffffffffffa161, 0xffffffffffff8001};
9781 uint64_t expected_z19[] = {0xfffffffff070b030, 0xffffffffc0408001};
9782
9783 ASSERT_EQUAL_SVE(expected_z15, z15.VnD());
9784 ASSERT_EQUAL_SVE(expected_z16, z16.VnD());
9785 ASSERT_EQUAL_SVE(expected_z17, z17.VnD());
9786 ASSERT_EQUAL_SVE(expected_z18, z18.VnD());
9787 ASSERT_EQUAL_SVE(expected_z19, z19.VnD());
9788
9789 // Scalar plus vector 64 unscaled offset
9790 uint64_t expected_z20[] = {0x00000000000000c0, 0x0000000000000001};
9791 uint64_t expected_z21[] = {0x00000000000020c0, 0x0000000000008001};
9792 uint64_t expected_z22[] = {0x0000000060a020c0, 0x00000000c0408001};
9793 uint64_t expected_z23[] = {0x00000000000020c0, 0xffffffffffff8001};
9794 uint64_t expected_z24[] = {0x0000000060a020c0, 0xffffffffc0408001};
9795
9796 ASSERT_EQUAL_SVE(expected_z20, z20.VnD());
9797 ASSERT_EQUAL_SVE(expected_z21, z21.VnD());
9798 ASSERT_EQUAL_SVE(expected_z22, z22.VnD());
9799 ASSERT_EQUAL_SVE(expected_z23, z23.VnD());
9800 ASSERT_EQUAL_SVE(expected_z24, z24.VnD());
9801
9802 uint64_t expected_z25[] = {0x000000000000e060, 0x0000000000008001};
9803 uint64_t expected_z26[] = {0xffffffffffffe060, 0xffffffffffff8001};
9804 uint64_t expected_z27[] = {0x00000000f070b030, 0x00000000c0408001};
9805 uint64_t expected_z28[] = {0xfffffffff070b030, 0xffffffffc0408001};
9806 uint64_t expected_z29[] = {0xf878b838d8589818, 0xe060a020c0408001};
9807
9808 // Scalar plus vector 64 scaled offset
9809 ASSERT_EQUAL_SVE(expected_z25, z25.VnD());
9810 ASSERT_EQUAL_SVE(expected_z26, z26.VnD());
9811 ASSERT_EQUAL_SVE(expected_z27, z27.VnD());
9812 ASSERT_EQUAL_SVE(expected_z28, z28.VnD());
9813 ASSERT_EQUAL_SVE(expected_z29, z29.VnD());
9814 }
9815}
9816
TatWai Chong113d9192020-05-19 01:02:36 -07009817// Test gather loads by comparing them with the result of a set of equivalent
9818// scalar loads.
Martyn Capewella5112342020-06-05 18:20:11 +01009819template <typename T>
TatWai Chong113d9192020-05-19 01:02:36 -07009820static void GatherLoadScalarPlusVectorHelper(Test* config,
9821 unsigned msize_in_bits,
9822 unsigned esize_in_bits,
9823 Ld1Macro ld1,
TatWai Chong6537a9a2020-05-05 14:15:16 -07009824 Ld1Macro ldff1,
Martyn Capewella5112342020-06-05 18:20:11 +01009825 T mod,
TatWai Chong113d9192020-05-19 01:02:36 -07009826 bool is_signed,
9827 bool is_scaled) {
9828 // SVE supports 32- and 64-bit addressing for gather loads.
9829 VIXL_ASSERT((esize_in_bits == kSRegSize) || (esize_in_bits == kDRegSize));
9830 static const unsigned kMaxLaneCount = kZRegMaxSize / kSRegSize;
9831
9832 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
9833 START();
9834
9835 unsigned msize_in_bytes = msize_in_bits / kBitsPerByte;
9836 int vl = config->sve_vl_in_bytes();
9837
9838 uint64_t addresses[kMaxLaneCount];
9839 uint64_t offsets[kMaxLaneCount];
9840 uint64_t max_address = 0;
9841 uint64_t buffer_size = vl * 64;
9842 uint64_t data = reinterpret_cast<uintptr_t>(malloc(buffer_size));
9843 // Fill the buffer with arbitrary data. Meanwhile, generate random offsets
9844 // into the buffer, and the corresponding addresses, in the output arrays.
9845 BufferFillingHelper(data,
9846 buffer_size,
9847 msize_in_bytes,
9848 kMaxLaneCount,
9849 offsets,
9850 addresses,
9851 &max_address);
9852
9853 ZRegister zn = z0.WithLaneSize(esize_in_bits);
9854 ZRegister zt_ref = z1.WithLaneSize(esize_in_bits);
Martyn Capewella5112342020-06-05 18:20:11 +01009855 ZRegister zt = z2.WithLaneSize(esize_in_bits);
9856 ZRegister zt_ff = z3.WithLaneSize(esize_in_bits);
9857 PRegisterWithLaneSize pg_ff = p1.WithLaneSize(esize_in_bits);
9858 PRegisterWithLaneSize pg_diff = p2.WithLaneSize(esize_in_bits);
TatWai Chong113d9192020-05-19 01:02:36 -07009859
9860 int shift = 0;
9861 if (is_scaled) {
9862 shift = std::log2(msize_in_bytes);
9863 for (unsigned i = 0; i < kMaxLaneCount; i++) {
9864 // Ensure the offsets are a multiple of the scale factor of the
9865 // operation.
9866 offsets[i] = (offsets[i] >> shift) << shift;
9867 addresses[i] = data + offsets[i];
9868 }
9869 }
9870
9871 PRegister all = p6;
9872 __ Ptrue(all.WithLaneSize(esize_in_bits));
9873
9874 PRegisterZ pg = p0.Zeroing();
9875 Initialise(&masm,
9876 pg,
9877 0x9abcdef012345678,
9878 0xabcdef0123456789,
9879 0xf4f3f1f0fefdfcfa,
9880 0xf9f8f6f5f3f2f1ff);
9881
9882 __ Mov(x0, data);
9883
9884 // Generate a reference result using scalar loads.
9885 ScalarLoadHelper(&masm,
9886 vl,
9887 addresses,
9888 zt_ref,
9889 pg,
9890 esize_in_bits,
9891 msize_in_bits,
9892 is_signed);
9893
9894 InsrHelper(&masm, zn, offsets);
9895 if (is_scaled) {
9896 // Scale down the offsets if testing scaled-offset operation.
9897 __ Lsr(zn, zn, shift);
9898 }
9899
Martyn Capewella5112342020-06-05 18:20:11 +01009900 (masm.*ld1)(zt, pg, SVEMemOperand(x0, zn, mod, shift));
TatWai Chong113d9192020-05-19 01:02:36 -07009901
TatWai Chong6537a9a2020-05-05 14:15:16 -07009902 Register ffr_check_count = x17;
9903 __ Mov(ffr_check_count, 0);
9904
TatWai Chong6537a9a2020-05-05 14:15:16 -07009905 // Check that the gather load reads the correct data from the scattered
9906 // addresses. The first-fault behavior itself is exercised in `Ldff1Helper`.
9907 __ Setffr();
Martyn Capewella5112342020-06-05 18:20:11 +01009908 (masm.*ldff1)(zt_ff, pg, SVEMemOperand(x0, zn, mod, shift));
9909
9910 // Compare these two vector registers and count any differences into
9911 // `ffr_check_count`.
9912 __ Rdffrs(pg_ff.VnB(), all.Zeroing());
9913 __ Cmpeq(pg_diff, all.Zeroing(), zt_ref, zt_ff);
9914 __ Eor(pg_diff.VnB(), all.Zeroing(), pg_diff.VnB(), pg_ff.VnB());
9915 __ Incp(ffr_check_count, pg_diff);
TatWai Chong6537a9a2020-05-05 14:15:16 -07009916
TatWai Chong113d9192020-05-19 01:02:36 -07009917 END();
9918
9919 if (CAN_RUN()) {
9920 RUN();
9921
Martyn Capewella5112342020-06-05 18:20:11 +01009922 ASSERT_EQUAL_SVE(zt_ref, zt);
TatWai Chong6537a9a2020-05-05 14:15:16 -07009923 ASSERT_EQUAL_64(0, ffr_check_count);
TatWai Chong113d9192020-05-19 01:02:36 -07009924 }
9925
9926 free(reinterpret_cast<void*>(data));
9927}
9928
Jacob Bramleydcdbd752020-01-20 11:47:36 +00009929// Test gather loads by comparing them with the result of a set of equivalent
9930// scalar loads.
9931template <typename F>
TatWai Chong113d9192020-05-19 01:02:36 -07009932static void GatherLoadScalarPlusScalarOrImmHelper(Test* config,
9933 unsigned msize_in_bits,
9934 unsigned esize_in_bits,
9935 F sve_ld1,
9936 bool is_signed) {
Jacob Bramleydcdbd752020-01-20 11:47:36 +00009937 // SVE supports 32- and 64-bit addressing for gather loads.
9938 VIXL_ASSERT((esize_in_bits == kSRegSize) || (esize_in_bits == kDRegSize));
9939 static const unsigned kMaxLaneCount = kZRegMaxSize / kSRegSize;
9940
9941 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
9942 START();
9943
9944 unsigned msize_in_bytes = msize_in_bits / kBitsPerByte;
Jacob Bramleydcdbd752020-01-20 11:47:36 +00009945 int vl = config->sve_vl_in_bytes();
9946
Jacob Bramleydcdbd752020-01-20 11:47:36 +00009947 uint64_t addresses[kMaxLaneCount];
9948 uint64_t offsets[kMaxLaneCount];
9949 uint64_t max_address = 0;
TatWai Chong85e15102020-05-04 21:00:40 -07009950 uint64_t buffer_size = vl * 64;
9951 uint64_t data = reinterpret_cast<uintptr_t>(malloc(buffer_size));
9952 BufferFillingHelper(data,
9953 buffer_size,
9954 msize_in_bytes,
9955 kMaxLaneCount,
9956 offsets,
9957 addresses,
9958 &max_address);
Jacob Bramleydcdbd752020-01-20 11:47:36 +00009959
9960 // Maximised offsets, to ensure that the address calculation is modulo-2^64,
9961 // and that the vector addresses are not sign-extended.
9962 uint64_t uint_e_max = (esize_in_bits == kDRegSize) ? UINT64_MAX : UINT32_MAX;
9963 uint64_t maxed_offsets[kMaxLaneCount];
9964 uint64_t maxed_offsets_imm = max_address - uint_e_max;
9965 for (unsigned i = 0; i < kMaxLaneCount; i++) {
9966 maxed_offsets[i] = addresses[i] - maxed_offsets_imm;
9967 }
9968
9969 ZRegister zn = z0.WithLaneSize(esize_in_bits);
9970 ZRegister zt_addresses = z1.WithLaneSize(esize_in_bits);
9971 ZRegister zt_offsets = z2.WithLaneSize(esize_in_bits);
9972 ZRegister zt_maxed = z3.WithLaneSize(esize_in_bits);
9973 ZRegister zt_ref = z4.WithLaneSize(esize_in_bits);
9974
9975 PRegisterZ pg = p0.Zeroing();
9976 Initialise(&masm,
9977 pg,
9978 0x9abcdef012345678,
9979 0xabcdef0123456789,
9980 0xf4f3f1f0fefdfcfa,
9981 0xf9f8f6f5f3f2f0ff);
9982
9983 // Execute each load.
9984
9985 if (esize_in_bits == kDRegSize) {
9986 // Only test `addresses` if we can use 64-bit pointers. InsrHelper will fail
9987 // if any value won't fit in a lane of zn.
9988 InsrHelper(&masm, zn, addresses);
9989 (masm.*sve_ld1)(zt_addresses, pg, SVEMemOperand(zn));
9990 }
9991
9992 InsrHelper(&masm, zn, offsets);
9993 (masm.*sve_ld1)(zt_offsets, pg, SVEMemOperand(zn, data));
9994
9995 InsrHelper(&masm, zn, maxed_offsets);
9996 (masm.*sve_ld1)(zt_maxed, pg, SVEMemOperand(zn, maxed_offsets_imm));
9997
Jacob Bramleydcdbd752020-01-20 11:47:36 +00009998 // Generate a reference result using scalar loads.
TatWai Chong85e15102020-05-04 21:00:40 -07009999 ScalarLoadHelper(&masm,
10000 vl,
10001 addresses,
10002 zt_ref,
10003 pg,
10004 esize_in_bits,
10005 msize_in_bits,
10006 is_signed);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010007
10008 END();
10009
10010 if (CAN_RUN()) {
10011 RUN();
10012
10013 if (esize_in_bits == kDRegSize) {
10014 ASSERT_EQUAL_SVE(zt_ref, zt_addresses);
10015 }
10016 ASSERT_EQUAL_SVE(zt_ref, zt_offsets);
10017 ASSERT_EQUAL_SVE(zt_ref, zt_maxed);
10018 }
10019
10020 free(reinterpret_cast<void*>(data));
10021}
10022
10023TEST_SVE(sve_ld1b_64bit_vector_plus_immediate) {
TatWai Chong113d9192020-05-19 01:02:36 -070010024 GatherLoadScalarPlusScalarOrImmHelper(config,
10025 kBRegSize,
10026 kDRegSize,
10027 &MacroAssembler::Ld1b,
10028 false);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010029}
10030
10031TEST_SVE(sve_ld1h_64bit_vector_plus_immediate) {
TatWai Chong113d9192020-05-19 01:02:36 -070010032 GatherLoadScalarPlusScalarOrImmHelper(config,
10033 kHRegSize,
10034 kDRegSize,
10035 &MacroAssembler::Ld1h,
10036 false);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010037}
10038
10039TEST_SVE(sve_ld1w_64bit_vector_plus_immediate) {
TatWai Chong113d9192020-05-19 01:02:36 -070010040 GatherLoadScalarPlusScalarOrImmHelper(config,
10041 kSRegSize,
10042 kDRegSize,
10043 &MacroAssembler::Ld1w,
10044 false);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010045}
10046
10047TEST_SVE(sve_ld1d_64bit_vector_plus_immediate) {
TatWai Chong113d9192020-05-19 01:02:36 -070010048 GatherLoadScalarPlusScalarOrImmHelper(config,
10049 kDRegSize,
10050 kDRegSize,
10051 &MacroAssembler::Ld1d,
10052 false);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010053}
10054
10055TEST_SVE(sve_ld1sb_64bit_vector_plus_immediate) {
TatWai Chong113d9192020-05-19 01:02:36 -070010056 GatherLoadScalarPlusScalarOrImmHelper(config,
10057 kBRegSize,
10058 kDRegSize,
10059 &MacroAssembler::Ld1sb,
10060 true);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010061}
10062
10063TEST_SVE(sve_ld1sh_64bit_vector_plus_immediate) {
TatWai Chong113d9192020-05-19 01:02:36 -070010064 GatherLoadScalarPlusScalarOrImmHelper(config,
10065 kHRegSize,
10066 kDRegSize,
10067 &MacroAssembler::Ld1sh,
10068 true);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010069}
10070
10071TEST_SVE(sve_ld1sw_64bit_vector_plus_immediate) {
TatWai Chong113d9192020-05-19 01:02:36 -070010072 GatherLoadScalarPlusScalarOrImmHelper(config,
10073 kSRegSize,
10074 kDRegSize,
10075 &MacroAssembler::Ld1sw,
10076 true);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010077}
10078
10079TEST_SVE(sve_ld1b_32bit_vector_plus_immediate) {
TatWai Chong113d9192020-05-19 01:02:36 -070010080 GatherLoadScalarPlusScalarOrImmHelper(config,
10081 kBRegSize,
10082 kSRegSize,
10083 &MacroAssembler::Ld1b,
10084 false);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010085}
10086
10087TEST_SVE(sve_ld1h_32bit_vector_plus_immediate) {
TatWai Chong113d9192020-05-19 01:02:36 -070010088 GatherLoadScalarPlusScalarOrImmHelper(config,
10089 kHRegSize,
10090 kSRegSize,
10091 &MacroAssembler::Ld1h,
10092 false);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010093}
10094
10095TEST_SVE(sve_ld1w_32bit_vector_plus_immediate) {
TatWai Chong113d9192020-05-19 01:02:36 -070010096 GatherLoadScalarPlusScalarOrImmHelper(config,
10097 kSRegSize,
10098 kSRegSize,
10099 &MacroAssembler::Ld1w,
10100 false);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010101}
10102
10103TEST_SVE(sve_ld1sb_32bit_vector_plus_immediate) {
TatWai Chong113d9192020-05-19 01:02:36 -070010104 GatherLoadScalarPlusScalarOrImmHelper(config,
10105 kBRegSize,
10106 kSRegSize,
10107 &MacroAssembler::Ld1sb,
10108 true);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010109}
10110
10111TEST_SVE(sve_ld1sh_32bit_vector_plus_immediate) {
TatWai Chong113d9192020-05-19 01:02:36 -070010112 GatherLoadScalarPlusScalarOrImmHelper(config,
10113 kHRegSize,
10114 kSRegSize,
10115 &MacroAssembler::Ld1sh,
10116 true);
10117}
10118
Martyn Capewella5112342020-06-05 18:20:11 +010010119TEST_SVE(sve_ld1_scalar_plus_vector_32_scaled_offset) {
10120 auto ld1_32_scaled_offset_helper =
10121 std::bind(&GatherLoadScalarPlusVectorHelper<Extend>,
10122 config,
10123 std::placeholders::_1,
10124 kSRegSize,
10125 std::placeholders::_2,
10126 std::placeholders::_3,
10127 std::placeholders::_4,
10128 std::placeholders::_5,
10129 true);
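  // The unbound placeholders are, in order: the memory access size, the Ld1
  // macro, the matching Ldff1 macro, the extend mode (UXTW or SXTW) and the
  // is_signed flag, as supplied at each call site below.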
10130
10131 Ld1Macro ld1h = &MacroAssembler::Ld1h;
10132 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
10133 ld1_32_scaled_offset_helper(kHRegSize, ld1h, ldff1h, UXTW, false);
10134 ld1_32_scaled_offset_helper(kHRegSize, ld1h, ldff1h, SXTW, false);
10135
10136 Ld1Macro ld1w = &MacroAssembler::Ld1w;
10137 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
10138 ld1_32_scaled_offset_helper(kSRegSize, ld1w, ldff1w, UXTW, false);
10139 ld1_32_scaled_offset_helper(kSRegSize, ld1w, ldff1w, SXTW, false);
10140
10141 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
10142 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
10143 ld1_32_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, UXTW, true);
10144 ld1_32_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, SXTW, true);
TatWai Chong113d9192020-05-19 01:02:36 -070010145}
10146
Martyn Capewella5112342020-06-05 18:20:11 +010010147TEST_SVE(sve_ld1_scalar_plus_vector_32_unscaled_offset) {
10148 auto ld1_32_unscaled_offset_helper =
10149 std::bind(&GatherLoadScalarPlusVectorHelper<Extend>,
10150 config,
10151 std::placeholders::_1,
10152 kSRegSize,
10153 std::placeholders::_2,
10154 std::placeholders::_3,
10155 std::placeholders::_4,
10156 std::placeholders::_5,
10157 false);
TatWai Chong113d9192020-05-19 01:02:36 -070010158
Martyn Capewella5112342020-06-05 18:20:11 +010010159 Ld1Macro ld1b = &MacroAssembler::Ld1b;
10160 Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
10161 ld1_32_unscaled_offset_helper(kBRegSize, ld1b, ldff1b, UXTW, false);
10162 ld1_32_unscaled_offset_helper(kBRegSize, ld1b, ldff1b, SXTW, false);
10163
10164 Ld1Macro ld1h = &MacroAssembler::Ld1h;
10165 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
10166 ld1_32_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, UXTW, false);
10167 ld1_32_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, SXTW, false);
10168
10169 Ld1Macro ld1w = &MacroAssembler::Ld1w;
10170 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
10171 ld1_32_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, UXTW, false);
10172 ld1_32_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, SXTW, false);
10173
10174 Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
10175 Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
10176 ld1_32_unscaled_offset_helper(kBRegSize, ld1sb, ldff1sb, UXTW, true);
10177 ld1_32_unscaled_offset_helper(kBRegSize, ld1sb, ldff1sb, SXTW, true);
10178
10179 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
10180 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
10181 ld1_32_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, UXTW, true);
10182 ld1_32_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, SXTW, true);
TatWai Chong113d9192020-05-19 01:02:36 -070010183}
10184
Martyn Capewella5112342020-06-05 18:20:11 +010010185TEST_SVE(sve_ld1_scalar_plus_vector_32_unpacked_scaled_offset) {
10186 auto ld1_32_unpacked_scaled_offset_helper =
10187 std::bind(&GatherLoadScalarPlusVectorHelper<Extend>,
10188 config,
10189 std::placeholders::_1,
10190 kDRegSize,
10191 std::placeholders::_2,
10192 std::placeholders::_3,
10193 std::placeholders::_4,
10194 std::placeholders::_5,
10195 true);
TatWai Chong113d9192020-05-19 01:02:36 -070010196
Martyn Capewella5112342020-06-05 18:20:11 +010010197 Ld1Macro ld1h = &MacroAssembler::Ld1h;
10198 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
10199 ld1_32_unpacked_scaled_offset_helper(kHRegSize, ld1h, ldff1h, UXTW, false);
10200 ld1_32_unpacked_scaled_offset_helper(kHRegSize, ld1h, ldff1h, SXTW, false);
10201
10202 Ld1Macro ld1w = &MacroAssembler::Ld1w;
10203 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
10204 ld1_32_unpacked_scaled_offset_helper(kSRegSize, ld1w, ldff1w, UXTW, false);
10205 ld1_32_unpacked_scaled_offset_helper(kSRegSize, ld1w, ldff1w, SXTW, false);
10206
10207 Ld1Macro ld1d = &MacroAssembler::Ld1d;
10208 Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
10209 ld1_32_unpacked_scaled_offset_helper(kDRegSize, ld1d, ldff1d, UXTW, false);
10210 ld1_32_unpacked_scaled_offset_helper(kDRegSize, ld1d, ldff1d, SXTW, false);
10211
10212 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
10213 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
10214 ld1_32_unpacked_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, UXTW, true);
10215 ld1_32_unpacked_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, SXTW, true);
10216
10217 Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
10218 Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
10219 ld1_32_unpacked_scaled_offset_helper(kSRegSize, ld1sw, ldff1sw, UXTW, true);
10220 ld1_32_unpacked_scaled_offset_helper(kSRegSize, ld1sw, ldff1sw, SXTW, true);
TatWai Chong113d9192020-05-19 01:02:36 -070010221}
10222
Martyn Capewella5112342020-06-05 18:20:11 +010010223TEST_SVE(sve_ld1_scalar_plus_vector_32_unpacked_unscaled_offset) {
10224 auto ld1_32_unpacked_unscaled_offset_helper =
10225 std::bind(&GatherLoadScalarPlusVectorHelper<Extend>,
10226 config,
10227 std::placeholders::_1,
10228 kDRegSize,
10229 std::placeholders::_2,
10230 std::placeholders::_3,
10231 std::placeholders::_4,
10232 std::placeholders::_5,
10233 false);
10234
10235 Ld1Macro ld1h = &MacroAssembler::Ld1h;
10236 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
10237 ld1_32_unpacked_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, UXTW, false);
10238 ld1_32_unpacked_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, SXTW, false);
10239
10240 Ld1Macro ld1w = &MacroAssembler::Ld1w;
10241 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
10242 ld1_32_unpacked_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, UXTW, false);
10243 ld1_32_unpacked_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, SXTW, false);
10244
10245 Ld1Macro ld1d = &MacroAssembler::Ld1d;
10246 Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
10247 ld1_32_unpacked_unscaled_offset_helper(kDRegSize, ld1d, ldff1d, UXTW, false);
10248 ld1_32_unpacked_unscaled_offset_helper(kDRegSize, ld1d, ldff1d, SXTW, false);
10249
10250 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
10251 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
10252 ld1_32_unpacked_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, UXTW, true);
10253 ld1_32_unpacked_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, SXTW, true);
10254
10255 Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
10256 Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
10257 ld1_32_unpacked_unscaled_offset_helper(kSRegSize, ld1sw, ldff1sw, UXTW, true);
10258 ld1_32_unpacked_unscaled_offset_helper(kSRegSize, ld1sw, ldff1sw, SXTW, true);
TatWai Chong113d9192020-05-19 01:02:36 -070010259}
10260
Martyn Capewella5112342020-06-05 18:20:11 +010010261TEST_SVE(sve_ld1_scalar_plus_vector_64_scaled_offset) {
10262 auto ld1_64_scaled_offset_helper =
10263 std::bind(&GatherLoadScalarPlusVectorHelper<Shift>,
10264 config,
10265 std::placeholders::_1,
10266 kDRegSize,
10267 std::placeholders::_2,
10268 std::placeholders::_3,
10269 LSL,
10270 std::placeholders::_4,
10271 true);
TatWai Chong113d9192020-05-19 01:02:36 -070010272
Martyn Capewella5112342020-06-05 18:20:11 +010010273 Ld1Macro ld1h = &MacroAssembler::Ld1h;
10274 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
10275 ld1_64_scaled_offset_helper(kHRegSize, ld1h, ldff1h, false);
10276
10277 Ld1Macro ld1w = &MacroAssembler::Ld1w;
10278 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
10279 ld1_64_scaled_offset_helper(kSRegSize, ld1w, ldff1w, false);
10280
10281 Ld1Macro ld1d = &MacroAssembler::Ld1d;
10282 Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
10283 ld1_64_scaled_offset_helper(kDRegSize, ld1d, ldff1d, false);
10284
10285 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
10286 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
10287 ld1_64_scaled_offset_helper(kHRegSize, ld1sh, ldff1sh, true);
10288
10289 Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
10290 Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
10291 ld1_64_scaled_offset_helper(kSRegSize, ld1sw, ldff1sw, true);
10292}
10293
10294TEST_SVE(sve_ld1_scalar_plus_vector_64_unscaled_offset) {
10295 auto ld1_64_unscaled_offset_helper =
10296 std::bind(&GatherLoadScalarPlusVectorHelper<Shift>,
10297 config,
10298 std::placeholders::_1,
10299 kDRegSize,
10300 std::placeholders::_2,
10301 std::placeholders::_3,
10302 NO_SHIFT,
10303 std::placeholders::_4,
10304 false);
10305
10306 Ld1Macro ld1b = &MacroAssembler::Ld1b;
10307 Ld1Macro ldff1b = &MacroAssembler::Ldff1b;
10308 ld1_64_unscaled_offset_helper(kBRegSize, ld1b, ldff1b, false);
10309
10310 Ld1Macro ld1h = &MacroAssembler::Ld1h;
10311 Ld1Macro ldff1h = &MacroAssembler::Ldff1h;
10312 ld1_64_unscaled_offset_helper(kHRegSize, ld1h, ldff1h, false);
10313
10314 Ld1Macro ld1w = &MacroAssembler::Ld1w;
10315 Ld1Macro ldff1w = &MacroAssembler::Ldff1w;
10316 ld1_64_unscaled_offset_helper(kSRegSize, ld1w, ldff1w, false);
10317
10318 Ld1Macro ld1d = &MacroAssembler::Ld1d;
10319 Ld1Macro ldff1d = &MacroAssembler::Ldff1d;
10320 ld1_64_unscaled_offset_helper(kDRegSize, ld1d, ldff1d, false);
10321
10322 Ld1Macro ld1sb = &MacroAssembler::Ld1sb;
10323 Ld1Macro ldff1sb = &MacroAssembler::Ldff1sb;
10324 ld1_64_unscaled_offset_helper(kBRegSize, ld1sb, ldff1sb, true);
10325
10326 Ld1Macro ld1sh = &MacroAssembler::Ld1sh;
10327 Ld1Macro ldff1sh = &MacroAssembler::Ldff1sh;
10328 ld1_64_unscaled_offset_helper(kHRegSize, ld1sh, ldff1sh, true);
10329
10330 Ld1Macro ld1sw = &MacroAssembler::Ld1sw;
10331 Ld1Macro ldff1sw = &MacroAssembler::Ldff1sw;
10332 ld1_64_unscaled_offset_helper(kSRegSize, ld1sw, ldff1sw, true);
Jacob Bramleydcdbd752020-01-20 11:47:36 +000010333}
10334
Martyn Capewell72765d12020-03-23 14:25:53 +000010335TEST_SVE(sve_ldnt1) {
10336 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
10337 START();
10338
10339 int data_size = kZRegMaxSizeInBytes * 16;
10340 uint8_t* data = new uint8_t[data_size];
10341 for (int i = 0; i < data_size; i++) {
10342 data[i] = i & 0xff;
10343 }
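
  // Architecturally, a non-temporal load returns the same data as a normal
  // load; it differs only in its cache-allocation hint, so each Ldnt1* result
  // below should match the corresponding Ld1* result exactly.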
10344
10345 // Set the base half-way through the buffer so we can use negative indices.
10346 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
10347 __ Ptrue(p0.VnB());
10348 __ Punpklo(p1.VnH(), p0.VnB());
10349 __ Punpklo(p2.VnH(), p1.VnB());
10350 __ Punpklo(p3.VnH(), p2.VnB());
10351 __ Punpklo(p4.VnH(), p3.VnB());
10352
10353 __ Mov(x1, 42);
10354 __ Ld1b(z0.VnB(), p1.Zeroing(), SVEMemOperand(x0, x1));
10355 __ Ldnt1b(z1.VnB(), p1.Zeroing(), SVEMemOperand(x0, x1));
10356
10357 __ Mov(x1, -21);
10358 __ Ld1h(z2.VnH(), p2.Zeroing(), SVEMemOperand(x0, x1, LSL, 1));
10359 __ Ldnt1h(z3.VnH(), p2.Zeroing(), SVEMemOperand(x0, x1, LSL, 1));
10360
10361 __ Mov(x1, 10);
10362 __ Ld1w(z4.VnS(), p3.Zeroing(), SVEMemOperand(x0, x1, LSL, 2));
10363 __ Ldnt1w(z5.VnS(), p3.Zeroing(), SVEMemOperand(x0, x1, LSL, 2));
10364
10365 __ Mov(x1, -5);
10366 __ Ld1d(z6.VnD(), p4.Zeroing(), SVEMemOperand(x0, x1, LSL, 3));
10367 __ Ldnt1d(z7.VnD(), p4.Zeroing(), SVEMemOperand(x0, x1, LSL, 3));
10368
10369 __ Ld1b(z8.VnB(), p1.Zeroing(), SVEMemOperand(x0, 1, SVE_MUL_VL));
10370 __ Ldnt1b(z9.VnB(), p1.Zeroing(), SVEMemOperand(x0, 1, SVE_MUL_VL));
10371
10372 __ Ld1h(z10.VnH(), p2.Zeroing(), SVEMemOperand(x0, -1, SVE_MUL_VL));
10373 __ Ldnt1h(z11.VnH(), p2.Zeroing(), SVEMemOperand(x0, -1, SVE_MUL_VL));
10374
10375 __ Ld1w(z12.VnS(), p3.Zeroing(), SVEMemOperand(x0, 7, SVE_MUL_VL));
10376 __ Ldnt1w(z13.VnS(), p3.Zeroing(), SVEMemOperand(x0, 7, SVE_MUL_VL));
10377
10378 __ Ld1d(z14.VnD(), p4.Zeroing(), SVEMemOperand(x0, -8, SVE_MUL_VL));
10379 __ Ldnt1d(z15.VnD(), p4.Zeroing(), SVEMemOperand(x0, -8, SVE_MUL_VL));
10380 END();
10381
10382 if (CAN_RUN()) {
10383 RUN();
10384 ASSERT_EQUAL_SVE(z0, z1);
10385 ASSERT_EQUAL_SVE(z2, z3);
10386 ASSERT_EQUAL_SVE(z4, z5);
10387 ASSERT_EQUAL_SVE(z6, z7);
10388 ASSERT_EQUAL_SVE(z8, z9);
10389 ASSERT_EQUAL_SVE(z10, z11);
10390 ASSERT_EQUAL_SVE(z12, z13);
10391 ASSERT_EQUAL_SVE(z14, z15);
10392 }
10393}
10394
Martyn Capewell3e2fb502020-03-24 12:04:07 +000010395TEST_SVE(sve_stnt1) {
10396 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
10397 START();
10398
10399 int data_size = kZRegMaxSizeInBytes * 16;
10400 uint8_t* data = new uint8_t[data_size];
10401
10402 // Set the base half-way through the buffer so we can use negative indices.
10403 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
10404 __ Ptrue(p0.VnB());
10405 __ Punpklo(p1.VnH(), p0.VnB());
10406 __ Punpklo(p2.VnH(), p1.VnB());
10407 __ Punpklo(p3.VnH(), p2.VnB());
10408 __ Punpklo(p4.VnH(), p3.VnB());
10409 __ Dup(z0.VnB(), 0x55);
10410 __ Index(z1.VnB(), 0, 1);
10411
10412 // Store with all-true and patterned predication, load back, and create a
10413 // reference value for later comparison.
10414 __ Rdvl(x1, 1);
10415 __ Stnt1b(z0.VnB(), p0, SVEMemOperand(x0, x1));
10416 __ Stnt1b(z1.VnB(), p1, SVEMemOperand(x0, 1, SVE_MUL_VL));
10417 __ Ld1b(z2.VnB(), p0.Zeroing(), SVEMemOperand(x0, x1));
10418 __ Sel(z3.VnB(), p1, z1.VnB(), z0.VnB());
10419
10420 // Repeated, with wider elements and different offsets.
10421 __ Rdvl(x1, -1);
10422 __ Lsr(x1, x1, 1);
10423 __ Stnt1h(z0.VnH(), p0, SVEMemOperand(x0, x1, LSL, 1));
10424 __ Stnt1h(z1.VnH(), p2, SVEMemOperand(x0, -1, SVE_MUL_VL));
10425 __ Ld1b(z4.VnB(), p0.Zeroing(), SVEMemOperand(x0, x1, LSL, 1));
10426 __ Sel(z5.VnH(), p2, z1.VnH(), z0.VnH());
10427
10428 __ Rdvl(x1, 7);
10429 __ Lsr(x1, x1, 2);
10430 __ Stnt1w(z0.VnS(), p0, SVEMemOperand(x0, x1, LSL, 2));
10431 __ Stnt1w(z1.VnS(), p3, SVEMemOperand(x0, 7, SVE_MUL_VL));
10432 __ Ld1b(z6.VnB(), p0.Zeroing(), SVEMemOperand(x0, x1, LSL, 2));
10433 __ Sel(z7.VnS(), p3, z1.VnS(), z0.VnS());
10434
10435 __ Rdvl(x1, -8);
10436 __ Lsr(x1, x1, 3);
10437 __ Stnt1d(z0.VnD(), p0, SVEMemOperand(x0, x1, LSL, 3));
10438 __ Stnt1d(z1.VnD(), p4, SVEMemOperand(x0, -8, SVE_MUL_VL));
10439 __ Ld1b(z8.VnB(), p0.Zeroing(), SVEMemOperand(x0, x1, LSL, 3));
10440 __ Sel(z9.VnD(), p4, z1.VnD(), z0.VnD());
10441 END();
10442
10443 if (CAN_RUN()) {
10444 RUN();
10445 ASSERT_EQUAL_SVE(z2, z3);
10446 ASSERT_EQUAL_SVE(z4, z5);
10447 ASSERT_EQUAL_SVE(z6, z7);
10448 ASSERT_EQUAL_SVE(z8, z9);
10449 }
10450}
10451
Martyn Capewell452ad8b2020-03-19 15:49:57 +000010452TEST_SVE(sve_ld1rq) {
10453 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
10454 START();
10455
10456 int data_size = (kQRegSizeInBytes + 128) * 2;
10457 uint8_t* data = new uint8_t[data_size];
10458 for (int i = 0; i < data_size; i++) {
10459 data[i] = i & 0xff;
10460 }
10461
10462 // Set the base half-way through the buffer so we can use negative indices.
10463 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
10464
10465 __ Index(z0.VnB(), 0, 1);
10466 __ Ptrue(p0.VnB());
10467 __ Cmplo(p0.VnB(), p0.Zeroing(), z0.VnB(), 4);
10468 __ Pfalse(p1.VnB());
10469 __ Zip1(p1.VnB(), p0.VnB(), p1.VnB());
10470
10471 // Load and broadcast using scalar offsets.
10472 __ Mov(x1, -42);
10473 __ Ld1rqb(z0.VnB(), p1.Zeroing(), SVEMemOperand(x0, x1));
10474
10475 __ Add(x2, x0, 1);
10476 __ Mov(x1, -21);
10477 __ Punpklo(p2.VnH(), p1.VnB());
10478 __ Ld1rqh(z1.VnH(), p2.Zeroing(), SVEMemOperand(x2, x1, LSL, 1));
10479
10480 __ Add(x2, x2, 1);
10481 __ Mov(x1, -10);
10482 __ Punpklo(p3.VnH(), p2.VnB());
10483 __ Ld1rqw(z2.VnS(), p3.Zeroing(), SVEMemOperand(x2, x1, LSL, 2));
10484
10485 __ Add(x2, x2, 1);
10486 __ Mov(x1, 5);
10487 __ Punpklo(p4.VnH(), p3.VnB());
10488 __ Ld1rqd(z3.VnD(), p4.Zeroing(), SVEMemOperand(x2, x1, LSL, 3));
10489
10490 // Check that all segments match by rotating the vector by one segment,
10491 // eoring, and orring across the vector.
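  // If every 128-bit segment holds identical data, rotating by one segment
  // leaves each vector unchanged, so the Eor result is all-zero and the Orv
  // reductions (accumulated into z4) must also be zero.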
Martyn Capewellbebdfeb2021-03-04 17:31:19 +000010492 __ Mov(z4, z0);
10493 __ Ext(z4.VnB(), z4.VnB(), z4.VnB(), 16);
Martyn Capewell452ad8b2020-03-19 15:49:57 +000010494 __ Eor(z4.VnB(), z4.VnB(), z0.VnB());
10495 __ Orv(b4, p0, z4.VnB());
Martyn Capewellbebdfeb2021-03-04 17:31:19 +000010496 __ Mov(z5, z1);
10497 __ Ext(z5.VnB(), z5.VnB(), z5.VnB(), 16);
Martyn Capewell452ad8b2020-03-19 15:49:57 +000010498 __ Eor(z5.VnB(), z5.VnB(), z1.VnB());
10499 __ Orv(b5, p0, z5.VnB());
10500 __ Orr(z4, z4, z5);
Martyn Capewellbebdfeb2021-03-04 17:31:19 +000010501 __ Mov(z5, z2);
10502 __ Ext(z5.VnB(), z5.VnB(), z5.VnB(), 16);
Martyn Capewell452ad8b2020-03-19 15:49:57 +000010503 __ Eor(z5.VnB(), z5.VnB(), z2.VnB());
10504 __ Orv(b5, p0, z5.VnB());
10505 __ Orr(z4, z4, z5);
Martyn Capewellbebdfeb2021-03-04 17:31:19 +000010506 __ Mov(z5, z3);
10507 __ Ext(z5.VnB(), z5.VnB(), z5.VnB(), 16);
Martyn Capewell452ad8b2020-03-19 15:49:57 +000010508 __ Eor(z5.VnB(), z5.VnB(), z3.VnB());
10509 __ Orv(b5, p0, z5.VnB());
10510 __ Orr(z4, z4, z5);
10511
10512 // Load and broadcast the same values, using immediate offsets.
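  // Each base-plus-immediate pair is chosen so that the effective address
  // matches the scalar-offset form above; for example, (x0 + 6) - 48 is the
  // same as the x0 + (-42) used for z0.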
10513 __ Add(x1, x0, 6);
10514 __ Ld1rqb(z5.VnB(), p1.Zeroing(), SVEMemOperand(x1, -48));
10515 __ Add(x1, x0, -9);
10516 __ Ld1rqh(z6.VnH(), p2.Zeroing(), SVEMemOperand(x1, -32));
10517 __ Add(x1, x0, -70);
10518 __ Ld1rqw(z7.VnS(), p3.Zeroing(), SVEMemOperand(x1, 32));
10519 __ Add(x1, x0, 27);
10520 __ Ld1rqd(z8.VnD(), p4.Zeroing(), SVEMemOperand(x1, 16));
10521 END();
10522
10523 if (CAN_RUN()) {
10524 RUN();
10525 uint64_t expected_z0[] = {0x0000000000000000, 0x006c006a00680066};
10526 uint64_t expected_z1[] = {0x000074730000706f, 0x00006c6b00006867};
10527 uint64_t expected_z2[] = {0x0000000075747372, 0x000000006d6c6b6a};
10528 uint64_t expected_z3[] = {0x0000000000000000, 0xc2c1c0bfbebdbcbb};
10529 uint64_t expected_z4[] = {0, 0};
10530 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
10531 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
10532 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
10533 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
10534 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
10535 ASSERT_EQUAL_SVE(z0, z5);
10536 ASSERT_EQUAL_SVE(z1, z6);
10537 ASSERT_EQUAL_SVE(z2, z7);
10538 ASSERT_EQUAL_SVE(z3, z8);
10539 }
10540}
10541
Martyn Capewellb56cf222020-05-05 17:38:28 +010010542TEST_SVE(sve_st1_vec_imm) {
10543 SVE_SETUP_WITH_FEATURES(CPUFeatures::kNEON, CPUFeatures::kSVE);
10544 START();
10545
10546 // TODO: Use mmap() to request a buffer in the low 4GB, which allows testing
10547 // 32-bit address vectors.
10548 int data_size = kZRegMaxSizeInBytes * 16;
10549 uint8_t* data = new uint8_t[data_size];
10550
10551 // Set the base to 16 bytes from the end of the buffer so we can use negative
10552 // indices.
10553 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size - 16]));
10554 __ Ptrue(p0.VnB());
10555
10556 // Store a vector of index values in reverse order, using
10557 // vector-plus-immediate addressing to begin at byte 15, then storing to
10558 // bytes 14, 13, etc.
10559 __ Index(z1.VnD(), x0, -1);
10560 __ Index(z2.VnD(), 0, 1);
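  // Lane j of z1 holds (x0 - j), so a store using immediate offset i writes
  // lane j of the data register to byte (i - j) relative to x0.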
10561
10562 // Iterate in order to store at least 16 bytes. The number of iterations
10563 // depends on VL, e.g. VL128 iterates eight times, storing bytes 15 and 14
10564 // on the first iteration, 13 and 12 on the next, etc.
10565 uint64_t dlanes = config->sve_vl_in_bytes() / kDRegSizeInBytes;
10566 for (int i = 15; i >= 0; i -= dlanes * kBRegSizeInBytes) {
10567 __ St1b(z2.VnD(), p0, SVEMemOperand(z1.VnD(), i));
10568 __ Incd(z2.VnD());
10569 }
10570
10571 // Reload the stored data, and build a reference for comparison. The reference
10572 // is truncated to a Q register, as only the least-significant 128 bits are
10573 // checked.
10574 __ Ldr(q4, MemOperand(x0));
10575 __ Index(z5.VnB(), 15, -1);
10576 __ Mov(q5, q5);
10577
10578 // Repeat for wider elements.
10579 __ Index(z1.VnD(), x0, -2); // Stepping by -2 for H-sized elements.
10580 __ Index(z2.VnD(), 0, 1);
10581 for (int i = 14; i >= 0; i -= dlanes * kHRegSizeInBytes) {
10582 __ St1h(z2.VnD(), p0, SVEMemOperand(z1.VnD(), i));
10583 __ Incd(z2.VnD());
10584 }
10585 __ Ldr(q6, MemOperand(x0));
10586 __ Index(z7.VnH(), 7, -1);
10587 __ Mov(q7, q7);
10588
10589 __ Index(z1.VnD(), x0, -4); // Stepping by -4 for S-sized elements.
10590 __ Index(z2.VnD(), 0, 1);
10591 for (int i = 12; i >= 0; i -= dlanes * kSRegSizeInBytes) {
10592 __ St1w(z2.VnD(), p0, SVEMemOperand(z1.VnD(), i));
10593 __ Incd(z2.VnD());
10594 }
10595 __ Ldr(q8, MemOperand(x0));
10596 __ Index(z9.VnS(), 3, -1);
10597 __ Mov(q9, q9);
10598
10599 __ Index(z1.VnD(), x0, -8); // Stepping by -8 for D-sized elements.
10600 __ Index(z2.VnD(), 0, 1);
10601 for (int i = 8; i >= 0; i -= dlanes * kDRegSizeInBytes) {
10602 __ St1d(z2.VnD(), p0, SVEMemOperand(z1.VnD(), i));
10603 __ Incd(z2.VnD());
10604 }
10605 __ Ldr(q10, MemOperand(x0));
10606 __ Index(z11.VnD(), 1, -1);
10607 __ Mov(q11, q11);
10608
10609 // Test predication by storing even halfwords to memory (using predication)
10610 // at byte-separated addresses. The result should be the same as storing
10611 // even halfwords contiguously to memory.
10612 __ Pfalse(p1.VnB());
10613 __ Zip1(p1.VnD(), p0.VnD(), p1.VnD());
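  // Zip1 interleaves the all-true and all-false predicates, so p1 activates
  // every other D-sized lane (lanes 0, 2, 4, ...).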
10614 __ Mov(x0, reinterpret_cast<uintptr_t>(data));
10615 __ Index(z1.VnD(), x0, 1);
10616 __ Index(z2.VnD(), 0x1000, 1);
10617 for (int i = 0; i < 16; i += dlanes) {
10618 __ St1h(z2.VnD(), p1, SVEMemOperand(z1.VnD(), i));
10619 __ Incd(z2.VnD());
10620 }
10621 __ Ldr(q2, MemOperand(x0));
10622 __ Index(z3.VnH(), 0x1000, 2);
10623 __ Mov(q3, q3);
10624
10625 END();
10626
10627 if (CAN_RUN()) {
10628 RUN();
10629
10630 ASSERT_EQUAL_SVE(z3, z2);
10631 ASSERT_EQUAL_SVE(z5, z4);
10632 ASSERT_EQUAL_SVE(z7, z6);
10633 ASSERT_EQUAL_SVE(z9, z8);
10634 ASSERT_EQUAL_SVE(z11, z10);
10635 }
10636}
10637
TatWai Chong5f3928c2020-06-11 00:09:20 -070010638template <typename T>
10639static void sve_st1_scalar_plus_vector_helper(Test* config,
10640 int esize_in_bits,
10641 T mod,
10642 bool is_scaled) {
Martyn Capewellfa098bc2020-05-12 10:21:56 +010010643 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
10644 START();
10645
10646 int vl = config->sve_vl_in_bytes();
10647 int data_size = vl * 160;
10648 uint8_t* data = new uint8_t[data_size];
10649 memset(data, 0, data_size);
TatWai Chong5f3928c2020-06-11 00:09:20 -070010650 int vl_per_esize = vl / (esize_in_bits / kBitsPerByte);
10651
10652 ZRegister zn_b = z0.WithLaneSize(esize_in_bits);
10653 ZRegister zn_h = z1.WithLaneSize(esize_in_bits);
10654 ZRegister zn_s = z2.WithLaneSize(esize_in_bits);
10655 ZRegister zn_d = z3.WithLaneSize(esize_in_bits);
10656
10657 ZRegister zn_ld_b = z10.WithLaneSize(esize_in_bits);
10658 ZRegister zn_ld_h = z11.WithLaneSize(esize_in_bits);
10659 ZRegister zn_ld_s = z12.WithLaneSize(esize_in_bits);
10660 ZRegister zn_ld_d = z13.WithLaneSize(esize_in_bits);
10661 ZRegister offsets = z31.WithLaneSize(esize_in_bits);
Martyn Capewellfa098bc2020-05-12 10:21:56 +010010662
10663 // Set the base half-way through the buffer so we can use negative indices.
10664 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
TatWai Chong5f3928c2020-06-11 00:09:20 -070010665 __ Ptrue(p6.WithLaneSize(esize_in_bits));
10666 __ Pfalse(p7.WithLaneSize(esize_in_bits));
10667 __ Zip1(p0.WithLaneSize(esize_in_bits),
10668 p6.WithLaneSize(esize_in_bits),
10669 p7.WithLaneSize(esize_in_bits));
10670 __ Zip1(p1.WithLaneSize(esize_in_bits),
10671 p7.WithLaneSize(esize_in_bits),
10672 p6.WithLaneSize(esize_in_bits));
Martyn Capewellfa098bc2020-05-12 10:21:56 +010010673
TatWai Chong5f3928c2020-06-11 00:09:20 -070010674 // `st1b` doesn't have the scaled-offset forms.
10675 if (is_scaled == false) {
10676 // Step the index by 2 to simulate a scatter memory access.
10677 __ Index(offsets, 1, 2);
10678 __ St1b(offsets, p0, SVEMemOperand(x0, offsets, mod));
10679 __ Ld1b(zn_ld_b, p0.Zeroing(), SVEMemOperand(x0, offsets, mod));
10680 __ Dup(zn_b, 0);
10681 __ Mov(zn_b, p0.Merging(), offsets);
10682 }
Martyn Capewellfa098bc2020-05-12 10:21:56 +010010683
Martyn Capewellfa098bc2020-05-12 10:21:56 +010010684 // Store the values to a range isolated from the other stores.
TatWai Chong5f3928c2020-06-11 00:09:20 -070010685 int scale = is_scaled ? 1 : 0;
10686 __ Add(x1, x0, vl_per_esize * 4);
10687 __ Index(offsets, 6, 4);
10688 __ St1h(offsets, p0, SVEMemOperand(x1, offsets, mod, scale));
10689 __ Ld1h(zn_ld_h, p0.Zeroing(), SVEMemOperand(x1, offsets, mod, scale));
10690 __ Dup(zn_h, 0);
10691 __ Mov(zn_h, p0.Merging(), offsets);
Martyn Capewellfa098bc2020-05-12 10:21:56 +010010692
TatWai Chong5f3928c2020-06-11 00:09:20 -070010693 scale = is_scaled ? 2 : 0;
10694 __ Add(x2, x0, UINT64_MAX + (vl_per_esize * -8) + 1);
10695 __ Index(offsets, 64, 8);
10696 if ((std::is_same<T, vixl::aarch64::Extend>::value) &&
10697 (static_cast<int>(mod) == SXTW)) {
10698 // Testing negative offsets.
10699 __ Neg(offsets, p6.Merging(), offsets);
10700 }
10701 __ St1w(offsets, p1, SVEMemOperand(x2, offsets, mod, scale));
10702 __ Ld1w(zn_ld_s, p1.Zeroing(), SVEMemOperand(x2, offsets, mod, scale));
10703 __ Dup(zn_s, 0);
10704 __ Mov(zn_s, p1.Merging(), offsets);
Martyn Capewellfa098bc2020-05-12 10:21:56 +010010705
TatWai Chong5f3928c2020-06-11 00:09:20 -070010706 if (esize_in_bits == kDRegSize) {
10707 // Test st1w by comparing the 32-bit value loaded correspondingly with the
10708 // 32-bit value stored.
10709 __ Lsl(zn_s, zn_s, kSRegSize);
10710 __ Lsr(zn_s, zn_s, kSRegSize);
10711 }
Martyn Capewellfa098bc2020-05-12 10:21:56 +010010712
TatWai Chong5f3928c2020-06-11 00:09:20 -070010713 // `st1d` doesn't have the S-sized lane forms.
10714 if (esize_in_bits == kDRegSize) {
10715 scale = is_scaled ? 3 : 0;
10716 __ Add(x3, x0, UINT64_MAX + (vl_per_esize * -16) + 1);
10717 __ Index(offsets, 128, 16);
10718 if ((std::is_same<T, vixl::aarch64::Extend>::value) &&
10719 (static_cast<int>(mod) == SXTW)) {
10720 __ Neg(offsets, p6.Merging(), offsets);
10721 }
10722 __ St1d(offsets, p1, SVEMemOperand(x3, offsets, mod, scale));
10723 __ Ld1d(zn_ld_d, p1.Zeroing(), SVEMemOperand(x3, offsets, mod, scale));
10724 __ Dup(zn_d, 0);
10725 __ Mov(zn_d, p1.Merging(), offsets);
10726 }
Martyn Capewellfa098bc2020-05-12 10:21:56 +010010727
10728 END();
10729
10730 if (CAN_RUN()) {
10731 RUN();
10732
TatWai Chong5f3928c2020-06-11 00:09:20 -070010733 if (!is_scaled) {
10734 ASSERT_EQUAL_SVE(zn_ld_b, zn_b);
10735 }
Martyn Capewellfa098bc2020-05-12 10:21:56 +010010736
TatWai Chong5f3928c2020-06-11 00:09:20 -070010737 ASSERT_EQUAL_SVE(zn_ld_h, zn_h);
10738 ASSERT_EQUAL_SVE(zn_ld_s, zn_s);
10739
10740 if (esize_in_bits == kDRegSize) {
10741 ASSERT_EQUAL_SVE(zn_ld_d, zn_d);
10742 }
Martyn Capewellfa098bc2020-05-12 10:21:56 +010010743 }
10744
10745 delete[] data;
10746}
10747
TatWai Chong5f3928c2020-06-11 00:09:20 -070010748TEST_SVE(sve_st1_sca_vec_32_unpacked_unscaled) {
10749 sve_st1_scalar_plus_vector_helper(config, kDRegSize, UXTW, false);
10750 sve_st1_scalar_plus_vector_helper(config, kDRegSize, SXTW, false);
10751}
10752
10753TEST_SVE(sve_st1_sca_vec_32_unpacked_scaled) {
10754 sve_st1_scalar_plus_vector_helper(config, kDRegSize, UXTW, true);
10755 sve_st1_scalar_plus_vector_helper(config, kDRegSize, SXTW, true);
10756}
10757
10758TEST_SVE(sve_st1_sca_vec_32_unscaled) {
10759 sve_st1_scalar_plus_vector_helper(config, kSRegSize, UXTW, false);
10760 sve_st1_scalar_plus_vector_helper(config, kSRegSize, SXTW, false);
10761}
10762
10763TEST_SVE(sve_st1_sca_vec_32_scaled) {
10764 sve_st1_scalar_plus_vector_helper(config, kSRegSize, UXTW, true);
10765 sve_st1_scalar_plus_vector_helper(config, kSRegSize, SXTW, true);
10766}
10767
10768TEST_SVE(sve_st1_sca_vec_64_scaled) {
10769 sve_st1_scalar_plus_vector_helper(config, kDRegSize, LSL, true);
10770}
10771
10772TEST_SVE(sve_st1_sca_vec_64_unscaled) {
10773 sve_st1_scalar_plus_vector_helper(config, kDRegSize, NO_SHIFT, false);
10774}
10775
TatWai Chong6995bfd2019-09-26 10:48:05 +010010776typedef void (MacroAssembler::*IntWideImmFn)(const ZRegister& zd,
10777 const ZRegister& zn,
10778 const IntegerOperand imm);
10779
10780template <typename F, typename Td, typename Tn>
10781static void IntWideImmHelper(Test* config,
10782 F macro,
10783 unsigned lane_size_in_bits,
10784 const Tn& zn_inputs,
10785 IntegerOperand imm,
10786 const Td& zd_expected) {
10787 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
10788 START();
10789
10790 ZRegister zd1 = z0.WithLaneSize(lane_size_in_bits);
10791 InsrHelper(&masm, zd1, zn_inputs);
10792
10793 // Also test with a different zn, to test the movprfx case.
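  // These immediate forms are destructive (the destination is also a source),
  // so when zd2 and zn differ the MacroAssembler is expected to emit a movprfx
  // (or an equivalent move) first; both paths must produce the same result.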
10794 ZRegister zn = z1.WithLaneSize(lane_size_in_bits);
10795 InsrHelper(&masm, zn, zn_inputs);
10796 ZRegister zd2 = z2.WithLaneSize(lane_size_in_bits);
10797 ZRegister zn_copy = z3.WithSameLaneSizeAs(zn);
10798
10799 // Make a copy so we can check that constructive operations preserve zn.
10800 __ Mov(zn_copy, zn);
10801
10802 {
10803 UseScratchRegisterScope temps(&masm);
10804 // The MacroAssembler needs a P scratch register for some of these macros,
10805 // and it doesn't have one by default.
10806 temps.Include(p3);
10807
10808 (masm.*macro)(zd1, zd1, imm);
10809 (masm.*macro)(zd2, zn, imm);
10810 }
10811
10812 END();
10813
10814 if (CAN_RUN()) {
10815 RUN();
10816
10817 ASSERT_EQUAL_SVE(zd_expected, zd1);
10818
10819 // Check the result from `instr` with movprfx is the same as
10820 // the immediate version.
10821 ASSERT_EQUAL_SVE(zd_expected, zd2);
10822
10823 ASSERT_EQUAL_SVE(zn_copy, zn);
10824 }
10825}
10826
10827TEST_SVE(sve_int_wide_imm_unpredicated_smax) {
10828 int in_b[] = {0, -128, 127, -127, 126, 1, -1, 55};
10829 int in_h[] = {0, -128, 127, INT16_MIN, INT16_MAX, 1, -1, 5555};
10830 int in_s[] = {0, -128, 127, INT32_MIN, INT32_MAX, 1, -1, 555555};
10831 int64_t in_d[] = {1, 10, 10000, 1000000};
10832
10833 IntWideImmFn fn = &MacroAssembler::Smax;
10834
10835 int exp_b_1[] = {0, -1, 127, -1, 126, 1, -1, 55};
10836 int exp_h_1[] = {127, 127, 127, 127, INT16_MAX, 127, 127, 5555};
10837 int exp_s_1[] = {0, -128, 127, -128, INT32_MAX, 1, -1, 555555};
10838 int64_t exp_d_1[] = {99, 99, 10000, 1000000};
10839
10840 IntWideImmHelper(config, fn, kBRegSize, in_b, -1, exp_b_1);
10841 IntWideImmHelper(config, fn, kHRegSize, in_h, 127, exp_h_1);
10842 IntWideImmHelper(config, fn, kSRegSize, in_s, -128, exp_s_1);
10843 IntWideImmHelper(config, fn, kDRegSize, in_d, 99, exp_d_1);
10844
10845 int exp_h_2[] = {0, -128, 127, -255, INT16_MAX, 1, -1, 5555};
10846 int exp_s_2[] = {2048, 2048, 2048, 2048, INT32_MAX, 2048, 2048, 555555};
10847 int64_t exp_d_2[] = {INT16_MAX, INT16_MAX, INT16_MAX, 1000000};
10848
10849 // The immediate is in the range [-128, 127], but the macro is able to
10850 // synthesise unencodable immediates.
10851 // B-sized lanes cannot take an immediate out of the range [-128, 127].
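  // A plausible synthesis (a sketch only; not necessarily the exact sequence
  // VIXL emits) is to materialise the immediate into a scratch Z register with
  // Dup and then use the vector form of the operation.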
10852 IntWideImmHelper(config, fn, kHRegSize, in_h, -255, exp_h_2);
10853 IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2);
10854 IntWideImmHelper(config, fn, kDRegSize, in_d, INT16_MAX, exp_d_2);
10855}
10856
10857TEST_SVE(sve_int_wide_imm_unpredicated_smin) {
10858 int in_b[] = {0, -128, 127, -127, 126, 1, -1, 55};
10859 int in_h[] = {0, -128, 127, INT16_MIN, INT16_MAX, 1, -1, 5555};
10860 int in_s[] = {0, -128, 127, INT32_MIN, INT32_MAX, 1, -1, 555555};
10861 int64_t in_d[] = {1, 10, 10000, 1000000};
10862
10863 IntWideImmFn fn = &MacroAssembler::Smin;
10864
10865 int exp_b_1[] = {-1, -128, -1, -127, -1, -1, -1, -1};
10866 int exp_h_1[] = {0, -128, 127, INT16_MIN, 127, 1, -1, 127};
10867 int exp_s_1[] = {-128, -128, -128, INT32_MIN, -128, -128, -128, -128};
10868 int64_t exp_d_1[] = {1, 10, 99, 99};
10869
10870 IntWideImmHelper(config, fn, kBRegSize, in_b, -1, exp_b_1);
10871 IntWideImmHelper(config, fn, kHRegSize, in_h, 127, exp_h_1);
10872 IntWideImmHelper(config, fn, kSRegSize, in_s, -128, exp_s_1);
10873 IntWideImmHelper(config, fn, kDRegSize, in_d, 99, exp_d_1);
10874
10875 int exp_h_2[] = {-255, -255, -255, INT16_MIN, -255, -255, -255, -255};
10876 int exp_s_2[] = {0, -128, 127, INT32_MIN, 2048, 1, -1, 2048};
10877 int64_t exp_d_2[] = {1, 10, 10000, INT16_MAX};
10878
10879 // The immediate is in the range [-128, 127], but the macro is able to
10880 // synthesise unencodable immediates.
10881 // B-sized lanes cannot take an immediate out of the range [-128, 127].
10882 IntWideImmHelper(config, fn, kHRegSize, in_h, -255, exp_h_2);
10883 IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2);
10884 IntWideImmHelper(config, fn, kDRegSize, in_d, INT16_MAX, exp_d_2);
10885}
10886
10887TEST_SVE(sve_int_wide_imm_unpredicated_umax) {
10888 int in_b[] = {0, 255, 127, 0x80, 1, 55};
10889 int in_h[] = {0, 255, 127, INT16_MAX, 1, 5555};
10890 int in_s[] = {0, 0xff, 0x7f, INT32_MAX, 1, 555555};
10891 int64_t in_d[] = {1, 10, 10000, 1000000};
10892
10893 IntWideImmFn fn = &MacroAssembler::Umax;
10894
10895 int exp_b_1[] = {17, 255, 127, 0x80, 17, 55};
10896 int exp_h_1[] = {127, 255, 127, INT16_MAX, 127, 5555};
10897 int exp_s_1[] = {255, 255, 255, INT32_MAX, 255, 555555};
10898 int64_t exp_d_1[] = {99, 99, 10000, 1000000};
10899
10900 IntWideImmHelper(config, fn, kBRegSize, in_b, 17, exp_b_1);
10901 IntWideImmHelper(config, fn, kHRegSize, in_h, 0x7f, exp_h_1);
10902 IntWideImmHelper(config, fn, kSRegSize, in_s, 0xff, exp_s_1);
10903 IntWideImmHelper(config, fn, kDRegSize, in_d, 99, exp_d_1);
10904
10905 int exp_h_2[] = {511, 511, 511, INT16_MAX, 511, 5555};
10906 int exp_s_2[] = {2048, 2048, 2048, INT32_MAX, 2048, 555555};
10907 int64_t exp_d_2[] = {INT16_MAX, INT16_MAX, INT16_MAX, 1000000};
10908
10909 // The immediate is in the range [0, 255], but the macro is able to
10910 // synthesise unencodable immediates.
10911 // B-sized lanes cannot take an immediate out of the range [0, 255].
10912 IntWideImmHelper(config, fn, kHRegSize, in_h, 511, exp_h_2);
10913 IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2);
10914 IntWideImmHelper(config, fn, kDRegSize, in_d, INT16_MAX, exp_d_2);
10915}
10916
10917TEST_SVE(sve_int_wide_imm_unpredicated_umin) {
10918 int in_b[] = {0, 255, 127, 0x80, 1, 55};
10919 int in_h[] = {0, 255, 127, INT16_MAX, 1, 5555};
10920 int in_s[] = {0, 0xff, 0x7f, INT32_MAX, 1, 555555};
10921 int64_t in_d[] = {1, 10, 10000, 1000000};
10922
10923 IntWideImmFn fn = &MacroAssembler::Umin;
10924
10925 int exp_b_1[] = {0, 17, 17, 17, 1, 17};
10926 int exp_h_1[] = {0, 127, 127, 127, 1, 127};
10927 int exp_s_1[] = {0, 255, 127, 255, 1, 255};
10928 int64_t exp_d_1[] = {1, 10, 99, 99};
10929
10930 IntWideImmHelper(config, fn, kBRegSize, in_b, 17, exp_b_1);
10931 IntWideImmHelper(config, fn, kHRegSize, in_h, 0x7f, exp_h_1);
10932 IntWideImmHelper(config, fn, kSRegSize, in_s, 255, exp_s_1);
10933 IntWideImmHelper(config, fn, kDRegSize, in_d, 99, exp_d_1);
10934
10935 int exp_h_2[] = {0, 255, 127, 511, 1, 511};
10936 int exp_s_2[] = {0, 255, 127, 2048, 1, 2048};
10937 int64_t exp_d_2[] = {1, 10, 10000, INT16_MAX};
10938
10939 // The immediate is in the range [0, 255], but the macro is able to
10940 // synthesise unencodable immediates.
10941 // B-sized lanes cannot take an immediate out of the range [0, 255].
10942 IntWideImmHelper(config, fn, kHRegSize, in_h, 511, exp_h_2);
10943 IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2);
10944 IntWideImmHelper(config, fn, kDRegSize, in_d, INT16_MAX, exp_d_2);
10945}
10946
10947TEST_SVE(sve_int_wide_imm_unpredicated_mul) {
10948 int in_b[] = {11, -1, 7, -3};
10949 int in_h[] = {111, -1, 17, -123};
10950 int in_s[] = {11111, -1, 117, -12345};
10951 int64_t in_d[] = {0x7fffffff, 0x80000000};
10952
10953 IntWideImmFn fn = &MacroAssembler::Mul;
10954
10955 int exp_b_1[] = {66, -6, 42, -18};
10956 int exp_h_1[] = {-14208, 128, -2176, 15744};
10957 int exp_s_1[] = {11111 * 127, -127, 117 * 127, -12345 * 127};
10958 int64_t exp_d_1[] = {0xfffffffe, 0x100000000};
10959
10960 IntWideImmHelper(config, fn, kBRegSize, in_b, 6, exp_b_1);
10961 IntWideImmHelper(config, fn, kHRegSize, in_h, -128, exp_h_1);
10962 IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
10963 IntWideImmHelper(config, fn, kDRegSize, in_d, 2, exp_d_1);
10964
10965 int exp_h_2[] = {-28305, 255, -4335, 31365};
10966 int exp_s_2[] = {22755328, -2048, 239616, -25282560};
10967 int64_t exp_d_2[] = {0x00000063ffffff38, 0x0000006400000000};
10968
10969 // The immediate is in the range [-128, 127], but the macro is able to
10970 // synthesise unencodable immediates.
10971 // B-sized lanes cannot take an immediate out of the range [-128, 127].
10972 IntWideImmHelper(config, fn, kHRegSize, in_h, -255, exp_h_2);
10973 IntWideImmHelper(config, fn, kSRegSize, in_s, 2048, exp_s_2);
10974 IntWideImmHelper(config, fn, kDRegSize, in_d, 200, exp_d_2);
10975
10976 // Integer overflow on multiplication.
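  // For example, 11 * 127 = 1397 = 0x575, which truncates to 0x75 in a B-sized
  // lane, and -3 * 127 = -381, which wraps to 0x83.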
10977 unsigned exp_b_3[] = {0x75, 0x81, 0x79, 0x83};
10978
10979 IntWideImmHelper(config, fn, kBRegSize, in_b, 0x7f, exp_b_3);
10980}
10981
10982TEST_SVE(sve_int_wide_imm_unpredicated_add) {
10983 unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
10984 unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
10985 unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
10986 uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};
10987
10988 IntWideImmFn fn = &MacroAssembler::Add;
10989
10990 unsigned exp_b_1[] = {0x02, 0x00, 0x91, 0x80};
10991 unsigned exp_h_1[] = {0x8191, 0x7f8f, 0x1020, 0xaaba};
10992 unsigned exp_s_1[] = {0x80018200, 0x7fff7ffe, 0xaaaaab29, 0xf000f16f};
10993 uint64_t exp_d_1[] = {0x8000000180018280, 0x7fffffff7fff807e};
10994
10995 // Encodable with `add` (shift 0).
10996 IntWideImmHelper(config, fn, kBRegSize, in_b, 0x81, exp_b_1);
10997 IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
10998 IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
10999 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff, exp_d_1);
11000
11001 unsigned exp_h_2[] = {0x9181, 0x8f7f, 0x2010, 0xbaaa};
11002 unsigned exp_s_2[] = {0x80020081, 0x7ffffe7f, 0xaaab29aa, 0xf0016ff0};
11003 uint64_t exp_d_2[] = {0x8000000180028081, 0x7fffffff80007e7f};
11004
11005 // Encodable with `add` (shift 8).
11006 // B-sized lanes cannot take a shift of 8.
11007 IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
11008 IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
11009 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
11010
11011 unsigned exp_s_3[] = {0x80808181, 0x807e7f7f, 0xab29aaaa, 0xf07ff0f0};
11012
11013 // The macro is able to synthesise unencodable immediates.
11014 IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 16, exp_s_3);
Jacob Bramleyd9f929c2019-10-02 11:42:56 +010011015
11016 unsigned exp_b_4[] = {0x61, 0x5f, 0xf0, 0xdf};
11017 unsigned exp_h_4[] = {0x6181, 0x5f7f, 0xf010, 0x8aaa};
11018 unsigned exp_s_4[] = {0x00018181, 0xffff7f7f, 0x2aaaaaaa, 0x7000f0f0};
11019 uint64_t exp_d_4[] = {0x8000000180018180, 0x7fffffff7fff7f7e};
11020
11021 // Negative immediates use `sub`.
11022 IntWideImmHelper(config, fn, kBRegSize, in_b, -0x20, exp_b_4);
11023 IntWideImmHelper(config, fn, kHRegSize, in_h, -0x2000, exp_h_4);
11024 IntWideImmHelper(config, fn, kSRegSize, in_s, INT32_MIN, exp_s_4);
11025 IntWideImmHelper(config, fn, kDRegSize, in_d, -1, exp_d_4);
TatWai Chong6995bfd2019-09-26 10:48:05 +010011026}
11027
11028TEST_SVE(sve_int_wide_imm_unpredicated_sqadd) {
11029 unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
11030 unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
11031 unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
11032 uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};
11033
11034 IntWideImmFn fn = &MacroAssembler::Sqadd;
11035
Jacob Bramleyb28f6172019-10-02 12:12:35 +010011036 unsigned exp_b_1[] = {0x02, 0x7f, 0x7f, 0x7f};
TatWai Chong6995bfd2019-09-26 10:48:05 +010011037 unsigned exp_h_1[] = {0x8191, 0x7f8f, 0x1020, 0xaaba};
11038 unsigned exp_s_1[] = {0x80018200, 0x7fff7ffe, 0xaaaaab29, 0xf000f16f};
11039 uint64_t exp_d_1[] = {0x8000000180018280, 0x7fffffff7fff807e};
11040
11041 // Encodable with `sqadd` (shift 0).
Jacob Bramleyb28f6172019-10-02 12:12:35 +010011042 // Note that encodable immediates are unsigned, even for signed saturation.
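  // For example, in B-sized lanes, 0x7f (+127) + 129 would be 256, so the lane
  // saturates to 0x7f (INT8_MAX), whereas 0x81 (-127) + 129 = 2 does not
  // saturate.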
11043 IntWideImmHelper(config, fn, kBRegSize, in_b, 129, exp_b_1);
TatWai Chong6995bfd2019-09-26 10:48:05 +010011044 IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
11045 IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
Jacob Bramleyb28f6172019-10-02 12:12:35 +010011046 IntWideImmHelper(config, fn, kDRegSize, in_d, 255, exp_d_1);
TatWai Chong6995bfd2019-09-26 10:48:05 +010011047
11048 unsigned exp_h_2[] = {0x9181, 0x7fff, 0x2010, 0xbaaa};
11049 unsigned exp_s_2[] = {0x80020081, 0x7ffffe7f, 0xaaab29aa, 0xf0016ff0};
11050 uint64_t exp_d_2[] = {0x8000000180028081, 0x7fffffff80007e7f};
11051
11052 // Encodable with `sqadd` (shift 8).
11053 // B-sized lanes cannot take a shift of 8.
11054 IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
11055 IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
11056 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
TatWai Chong6995bfd2019-09-26 10:48:05 +010011057}
11058
11059TEST_SVE(sve_int_wide_imm_unpredicated_uqadd) {
11060 unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
11061 unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
11062 unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
11063 uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};
11064
11065 IntWideImmFn fn = &MacroAssembler::Uqadd;
11066
11067 unsigned exp_b_1[] = {0xff, 0xff, 0x91, 0xff};
11068 unsigned exp_h_1[] = {0x8191, 0x7f8f, 0x1020, 0xaaba};
11069 unsigned exp_s_1[] = {0x80018200, 0x7fff7ffe, 0xaaaaab29, 0xf000f16f};
11070 uint64_t exp_d_1[] = {0x8000000180018280, 0x7fffffff7fff807e};
11071
11072 // Encodable with `uqadd` (shift 0).
11073 IntWideImmHelper(config, fn, kBRegSize, in_b, 0x81, exp_b_1);
11074 IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
11075 IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
11076 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff, exp_d_1);
11077
11078 unsigned exp_h_2[] = {0x9181, 0x8f7f, 0x2010, 0xbaaa};
11079 unsigned exp_s_2[] = {0x80020081, 0x7ffffe7f, 0xaaab29aa, 0xf0016ff0};
11080 uint64_t exp_d_2[] = {0x8000000180028081, 0x7fffffff80007e7f};
11081
11082 // Encodable with `uqadd` (shift 8).
11083 // B-sized lanes cannot take a shift of 8.
11084 IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
11085 IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
11086 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
TatWai Chong6995bfd2019-09-26 10:48:05 +010011087}
11088
11089TEST_SVE(sve_int_wide_imm_unpredicated_sub) {
11090 unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
11091 unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
11092 unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
11093 uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};
11094
11095 IntWideImmFn fn = &MacroAssembler::Sub;
11096
11097 unsigned exp_b_1[] = {0x00, 0xfe, 0x8f, 0x7e};
11098 unsigned exp_h_1[] = {0x8171, 0x7f6f, 0x1000, 0xaa9a};
11099 unsigned exp_s_1[] = {0x80018102, 0x7fff7f00, 0xaaaaaa2b, 0xf000f071};
11100 uint64_t exp_d_1[] = {0x8000000180018082, 0x7fffffff7fff7e80};
11101
11102 // Encodable with `sub` (shift 0).
11103 IntWideImmHelper(config, fn, kBRegSize, in_b, 0x81, exp_b_1);
11104 IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
11105 IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
11106 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff, exp_d_1);
11107
11108 unsigned exp_h_2[] = {0x7181, 0x6f7f, 0x0010, 0x9aaa};
11109 unsigned exp_s_2[] = {0x80010281, 0x7fff007f, 0xaaaa2baa, 0xf00071f0};
11110 uint64_t exp_d_2[] = {0x8000000180008281, 0x7fffffff7ffe807f};
11111
11112 // Encodable with `sub` (shift 8).
11113 // B-sized lanes cannot take a shift of 8.
11114 IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
11115 IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
11116 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
11117
11118 unsigned exp_s_3[] = {0x7f828181, 0x7f807f7f, 0xaa2baaaa, 0xef81f0f0};
11119
11120 // The macro is able to synthesise unencodable immediates.
11121 IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 16, exp_s_3);
Jacob Bramleyd9f929c2019-10-02 11:42:56 +010011122
11123 unsigned exp_b_4[] = {0xa1, 0x9f, 0x30, 0x1f};
11124 unsigned exp_h_4[] = {0xa181, 0x9f7f, 0x3010, 0xcaaa};
11125 unsigned exp_s_4[] = {0x00018181, 0xffff7f7f, 0x2aaaaaaa, 0x7000f0f0};
11126 uint64_t exp_d_4[] = {0x8000000180018182, 0x7fffffff7fff7f80};
11127
11128 // Negative immediates use `add`.
11129 IntWideImmHelper(config, fn, kBRegSize, in_b, -0x20, exp_b_4);
11130 IntWideImmHelper(config, fn, kHRegSize, in_h, -0x2000, exp_h_4);
11131 IntWideImmHelper(config, fn, kSRegSize, in_s, INT32_MIN, exp_s_4);
11132 IntWideImmHelper(config, fn, kDRegSize, in_d, -1, exp_d_4);
TatWai Chong6995bfd2019-09-26 10:48:05 +010011133}
11134
11135TEST_SVE(sve_int_wide_imm_unpredicated_sqsub) {
11136 unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
11137 unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
11138 unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
11139 uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};
11140
11141 IntWideImmFn fn = &MacroAssembler::Sqsub;
11142
Jacob Bramleyb28f6172019-10-02 12:12:35 +010011143 unsigned exp_b_1[] = {0x80, 0xfe, 0x8f, 0x80};
TatWai Chong6995bfd2019-09-26 10:48:05 +010011144 unsigned exp_h_1[] = {0x8171, 0x7f6f, 0x1000, 0xaa9a};
11145 unsigned exp_s_1[] = {0x80018102, 0x7fff7f00, 0xaaaaaa2b, 0xf000f071};
11146 uint64_t exp_d_1[] = {0x8000000180018082, 0x7fffffff7fff7e80};
11147
11148 // Encodable with `sqsub` (shift 0).
Jacob Bramleyb28f6172019-10-02 12:12:35 +010011149 // Note that encodable immediates are unsigned, even for signed saturation.
11150 IntWideImmHelper(config, fn, kBRegSize, in_b, 129, exp_b_1);
TatWai Chong6995bfd2019-09-26 10:48:05 +010011151 IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
11152 IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
Jacob Bramleyb28f6172019-10-02 12:12:35 +010011153 IntWideImmHelper(config, fn, kDRegSize, in_d, 255, exp_d_1);
TatWai Chong6995bfd2019-09-26 10:48:05 +010011154
11155 unsigned exp_h_2[] = {0x8000, 0x6f7f, 0x0010, 0x9aaa};
11156 unsigned exp_s_2[] = {0x80010281, 0x7fff007f, 0xaaaa2baa, 0xf00071f0};
11157 uint64_t exp_d_2[] = {0x8000000180008281, 0x7fffffff7ffe807f};
11158
11159 // Encodable with `sqsub` (shift 8).
11160 // B-sized lanes cannot take a shift of 8.
11161 IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
11162 IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
11163 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
TatWai Chong6995bfd2019-09-26 10:48:05 +010011164}
11165
11166TEST_SVE(sve_int_wide_imm_unpredicated_uqsub) {
11167 unsigned in_b[] = {0x81, 0x7f, 0x10, 0xff};
11168 unsigned in_h[] = {0x8181, 0x7f7f, 0x1010, 0xaaaa};
11169 unsigned in_s[] = {0x80018181, 0x7fff7f7f, 0xaaaaaaaa, 0xf000f0f0};
11170 uint64_t in_d[] = {0x8000000180018181, 0x7fffffff7fff7f7f};
11171
11172 IntWideImmFn fn = &MacroAssembler::Uqsub;
11173
11174 unsigned exp_b_1[] = {0x00, 0x00, 0x00, 0x7e};
11175 unsigned exp_h_1[] = {0x8171, 0x7f6f, 0x1000, 0xaa9a};
11176 unsigned exp_s_1[] = {0x80018102, 0x7fff7f00, 0xaaaaaa2b, 0xf000f071};
11177 uint64_t exp_d_1[] = {0x8000000180018082, 0x7fffffff7fff7e80};
11178
11179 // Encodable with `uqsub` (shift 0).
11180 IntWideImmHelper(config, fn, kBRegSize, in_b, 0x81, exp_b_1);
11181 IntWideImmHelper(config, fn, kHRegSize, in_h, 16, exp_h_1);
11182 IntWideImmHelper(config, fn, kSRegSize, in_s, 127, exp_s_1);
11183 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff, exp_d_1);
11184
11185 unsigned exp_h_2[] = {0x7181, 0x6f7f, 0x0010, 0x9aaa};
11186 unsigned exp_s_2[] = {0x80010281, 0x7fff007f, 0xaaaa2baa, 0xf00071f0};
11187 uint64_t exp_d_2[] = {0x8000000180008281, 0x7fffffff7ffe807f};
11188
11189 // Encodable with `uqsub` (shift 8).
11190 // B-sized lanes cannot take a shift of 8.
11191 IntWideImmHelper(config, fn, kHRegSize, in_h, 16 << 8, exp_h_2);
11192 IntWideImmHelper(config, fn, kSRegSize, in_s, 127 << 8, exp_s_2);
11193 IntWideImmHelper(config, fn, kDRegSize, in_d, 0xff << 8, exp_d_2);
TatWai Chong6995bfd2019-09-26 10:48:05 +010011194}
11195
11196TEST_SVE(sve_int_wide_imm_unpredicated_subr) {
11197 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
11198 START();
11199
11200 // Encodable with `subr` (shift 0).
11201 __ Index(z0.VnD(), 1, 1);
11202 __ Sub(z0.VnD(), 100, z0.VnD());
11203 __ Index(z1.VnS(), 0x7f, 1);
11204 __ Sub(z1.VnS(), 0xf7, z1.VnS());
11205 __ Index(z2.VnH(), 0xaaaa, 0x2222);
11206 __ Sub(z2.VnH(), 0x80, z2.VnH());
11207 __ Index(z3.VnB(), 133, 1);
11208 __ Sub(z3.VnB(), 255, z3.VnB());
11209
11210 // Encodable with `subr` (shift 8).
11211 __ Index(z4.VnD(), 256, -1);
11212 __ Sub(z4.VnD(), 42 * 256, z4.VnD());
11213 __ Index(z5.VnS(), 0x7878, 1);
11214 __ Sub(z5.VnS(), 0x8000, z5.VnS());
11215 __ Index(z6.VnH(), 0x30f0, -1);
11216 __ Sub(z6.VnH(), 0x7f00, z6.VnH());
11217 // B-sized lanes cannot take a shift of 8.
11218
11219 // Use a destination different from the source, to exercise the movprfx form.
11220 __ Index(z31.VnD(), 256, 4001);
11221 __ Sub(z7.VnD(), 42 * 256, z31.VnD());
11222
11223 // Outside the encodable immediate range of `subr`.
11224 __ Index(z30.VnS(), 0x11223344, 1);
11225 __ Sub(z8.VnS(), 0x88776655, z30.VnS());
11226
11227 END();
11228
11229 if (CAN_RUN()) {
11230 RUN();
11231
11232 int expected_z0[] = {87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99};
11233 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
11234
11235 int expected_z1[] = {0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78};
11236 ASSERT_EQUAL_SVE(expected_z1, z1.VnS());
11237
11238 int expected_z2[] = {0xab2c, 0xcd4e, 0xef70, 0x1192, 0x33b4, 0x55d6};
11239 ASSERT_EQUAL_SVE(expected_z2, z2.VnH());
11240
11241 int expected_z3[] = {0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a};
11242 ASSERT_EQUAL_SVE(expected_z3, z3.VnB());
11243
11244 int expected_z4[] = {10502, 10501, 10500, 10499, 10498, 10497, 10496};
11245 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
11246
11247 int expected_z5[] = {0x0783, 0x0784, 0x0785, 0x0786, 0x0787, 0x0788};
11248 ASSERT_EQUAL_SVE(expected_z5, z5.VnS());
11249
11250 int expected_z6[] = {0x4e15, 0x4e14, 0x4e13, 0x4e12, 0x4e11, 0x4e10};
11251 ASSERT_EQUAL_SVE(expected_z6, z6.VnH());
11252
11253 int expected_z7[] = {-13510, -9509, -5508, -1507, 2494, 6495, 10496};
11254 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
11255
11256 int expected_z8[] = {0x7755330e, 0x7755330f, 0x77553310, 0x77553311};
11257 ASSERT_EQUAL_SVE(expected_z8, z8.VnS());
11258 }
11259}
11260
11261TEST_SVE(sve_int_wide_imm_unpredicated_fdup) {
11262 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
11263 START();
11264
11265 // Immediates which can be encoded in the instructions.
11266 __ Fdup(z0.VnH(), RawbitsToFloat16(0xc500));
11267 __ Fdup(z1.VnS(), Float16(2.0));
11268 __ Fdup(z2.VnD(), Float16(3.875));
11269 __ Fdup(z3.VnH(), 8.0f);
11270 __ Fdup(z4.VnS(), -4.75f);
11271 __ Fdup(z5.VnD(), 0.5f);
11272 __ Fdup(z6.VnH(), 1.0);
11273 __ Fdup(z7.VnS(), 2.125);
11274 __ Fdup(z8.VnD(), -13.0);
11275
11276 // Immediates which cannot be encoded in the instructions.
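  // Fdup only encodes the limited FMOV-style imm8 set of FP immediates, so
  // these values are expected to be materialised some other way (for example,
  // by moving the bit pattern via a general-purpose register and duplicating).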
11277 __ Fdup(z10.VnH(), Float16(0.0));
11278 __ Fdup(z11.VnH(), kFP16PositiveInfinity);
11279 __ Fdup(z12.VnS(), 255.0f);
11280 __ Fdup(z13.VnS(), kFP32NegativeInfinity);
11281 __ Fdup(z14.VnD(), 12.3456);
11282 __ Fdup(z15.VnD(), kFP64PositiveInfinity);
11283
11284 END();
11285
11286 if (CAN_RUN()) {
11287 RUN();
11288
11289 ASSERT_EQUAL_SVE(0xc500, z0.VnH());
11290 ASSERT_EQUAL_SVE(0x40000000, z1.VnS());
11291 ASSERT_EQUAL_SVE(0x400f000000000000, z2.VnD());
11292 ASSERT_EQUAL_SVE(0x4800, z3.VnH());
11293 ASSERT_EQUAL_SVE(FloatToRawbits(-4.75f), z4.VnS());
11294 ASSERT_EQUAL_SVE(DoubleToRawbits(0.5), z5.VnD());
11295 ASSERT_EQUAL_SVE(0x3c00, z6.VnH());
11296 ASSERT_EQUAL_SVE(FloatToRawbits(2.125f), z7.VnS());
11297 ASSERT_EQUAL_SVE(DoubleToRawbits(-13.0), z8.VnD());
11298
11299 ASSERT_EQUAL_SVE(0x0000, z10.VnH());
11300 ASSERT_EQUAL_SVE(Float16ToRawbits(kFP16PositiveInfinity), z11.VnH());
11301 ASSERT_EQUAL_SVE(FloatToRawbits(255.0), z12.VnS());
11302 ASSERT_EQUAL_SVE(FloatToRawbits(kFP32NegativeInfinity), z13.VnS());
11303 ASSERT_EQUAL_SVE(DoubleToRawbits(12.3456), z14.VnD());
11304 ASSERT_EQUAL_SVE(DoubleToRawbits(kFP64PositiveInfinity), z15.VnD());
11305 }
11306}
11307
TatWai Chong6f111bc2019-10-07 09:20:37 +010011308TEST_SVE(sve_andv_eorv_orv) {
11309 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
11310 START();
11311
11312 uint64_t in[] = {0x8899aabbccddeeff, 0x7777555533331111, 0x123456789abcdef0};
11313 InsrHelper(&masm, z31.VnD(), in);
11314
11315 // For simplicity, we re-use the same pg for various lane sizes.
11316 // For D lanes: 1, 1, 0
11317 // For S lanes: 1, 1, 1, 0, 0
11318 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
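  // A lane wider than a byte is active if the predicate bit of its
  // lowest-numbered byte is set, which is how the single byte pattern below
  // yields the H, S and D interpretations above.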
11319 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
11320 Initialise(&masm, p0.VnB(), pg_in);
11321
11322 // Make a copy so we can check that constructive operations preserve zn.
11323 __ Mov(z0, z31);
11324 __ Andv(b0, p0, z0.VnB()); // destructive
11325 __ Andv(h1, p0, z31.VnH());
11326 __ Mov(z2, z31);
11327 __ Andv(s2, p0, z2.VnS()); // destructive
11328 __ Andv(d3, p0, z31.VnD());
11329
11330 __ Eorv(b4, p0, z31.VnB());
11331 __ Mov(z5, z31);
11332 __ Eorv(h5, p0, z5.VnH()); // destructive
11333 __ Eorv(s6, p0, z31.VnS());
11334 __ Mov(z7, z31);
11335 __ Eorv(d7, p0, z7.VnD()); // destructive
11336
11337 __ Mov(z8, z31);
11338 __ Orv(b8, p0, z8.VnB()); // destructive
11339 __ Orv(h9, p0, z31.VnH());
11340 __ Mov(z10, z31);
11341 __ Orv(s10, p0, z10.VnS()); // destructive
11342 __ Orv(d11, p0, z31.VnD());
11343
11344 END();
11345
11346 if (CAN_RUN()) {
11347 RUN();
11348
11349 if (static_cast<int>(ArrayLength(pg_in)) >= config->sve_vl_in_bytes()) {
11350 ASSERT_EQUAL_64(0x10, d0);
11351 ASSERT_EQUAL_64(0x1010, d1);
11352 ASSERT_EQUAL_64(0x33331111, d2);
11353 ASSERT_EQUAL_64(0x7777555533331111, d3);
11354 ASSERT_EQUAL_64(0xbf, d4);
11355 ASSERT_EQUAL_64(0xedcb, d5);
11356 ASSERT_EQUAL_64(0x44444444, d6);
11357 ASSERT_EQUAL_64(0x7777555533331111, d7);
11358 ASSERT_EQUAL_64(0xff, d8);
11359 ASSERT_EQUAL_64(0xffff, d9);
11360 ASSERT_EQUAL_64(0x77775555, d10);
11361 ASSERT_EQUAL_64(0x7777555533331111, d11);
11362 } else {
11363 ASSERT_EQUAL_64(0, d0);
11364 ASSERT_EQUAL_64(0x0010, d1);
11365 ASSERT_EQUAL_64(0x00110011, d2);
11366 ASSERT_EQUAL_64(0x0011001100110011, d3);
11367 ASSERT_EQUAL_64(0x62, d4);
11368 ASSERT_EQUAL_64(0x0334, d5);
11369 ASSERT_EQUAL_64(0x8899aabb, d6);
11370 ASSERT_EQUAL_64(0xffeeffeeffeeffee, d7);
11371 ASSERT_EQUAL_64(0xff, d8);
11372 ASSERT_EQUAL_64(0xffff, d9);
11373 ASSERT_EQUAL_64(0xffffffff, d10);
11374 ASSERT_EQUAL_64(0xffffffffffffffff, d11);
11375 }
11376
11377 // Check the upper lanes above the top of the V register are all clear.
11378 for (int i = 1; i < core.GetSVELaneCount(kDRegSize); i++) {
11379 ASSERT_EQUAL_SVE_LANE(0, z0.VnD(), i);
11380 ASSERT_EQUAL_SVE_LANE(0, z1.VnD(), i);
11381 ASSERT_EQUAL_SVE_LANE(0, z2.VnD(), i);
11382 ASSERT_EQUAL_SVE_LANE(0, z3.VnD(), i);
11383 ASSERT_EQUAL_SVE_LANE(0, z4.VnD(), i);
11384 ASSERT_EQUAL_SVE_LANE(0, z5.VnD(), i);
11385 ASSERT_EQUAL_SVE_LANE(0, z6.VnD(), i);
11386 ASSERT_EQUAL_SVE_LANE(0, z7.VnD(), i);
11387 ASSERT_EQUAL_SVE_LANE(0, z8.VnD(), i);
11388 ASSERT_EQUAL_SVE_LANE(0, z9.VnD(), i);
11389 ASSERT_EQUAL_SVE_LANE(0, z10.VnD(), i);
11390 ASSERT_EQUAL_SVE_LANE(0, z11.VnD(), i);
11391 }
11392 }
11393}
11394
TatWai Chongb2d8d1f2019-10-21 15:19:31 -070011395
11396TEST_SVE(sve_saddv_uaddv) {
11397 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
11398 START();
11399
11400 uint64_t in[] = {0x8899aabbccddeeff, 0x8182838485868788, 0x0807060504030201};
11401 InsrHelper(&masm, z31.VnD(), in);
11402
11403 // For simplicity, we re-use the same pg for various lane sizes.
11404 // For D lanes: 1, 1, 0
11405 // For S lanes: 1, 1, 1, 0, 0
11406 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
11407 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
11408 Initialise(&masm, p0.VnB(), pg_in);
11409
11410 // Make a copy so we can check that constructive operations preserve zn.
11411 __ Mov(z0, z31);
11412 __ Saddv(b0, p0, z0.VnB()); // destructive
11413 __ Saddv(h1, p0, z31.VnH());
11414 __ Mov(z2, z31);
11415 __ Saddv(s2, p0, z2.VnS()); // destructive
11416
11417 __ Uaddv(b4, p0, z31.VnB());
11418 __ Mov(z5, z31);
11419 __ Uaddv(h5, p0, z5.VnH()); // destructive
11420 __ Uaddv(s6, p0, z31.VnS());
11421 __ Mov(z7, z31);
11422 __ Uaddv(d7, p0, z7.VnD()); // destructive
11423
11424 END();
11425
11426 if (CAN_RUN()) {
11427 RUN();
11428
11429 if (static_cast<int>(ArrayLength(pg_in)) >= config->sve_vl_in_bytes()) {
11430 // Saddv
11431 ASSERT_EQUAL_64(0xfffffffffffffda9, d0);
11432 ASSERT_EQUAL_64(0xfffffffffffe9495, d1);
11433 ASSERT_EQUAL_64(0xffffffff07090b0c, d2);
11434 // Uaddv
11435 ASSERT_EQUAL_64(0x00000000000002a9, d4);
11436 ASSERT_EQUAL_64(0x0000000000019495, d5);
11437 ASSERT_EQUAL_64(0x0000000107090b0c, d6);
11438 ASSERT_EQUAL_64(0x8182838485868788, d7);
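      // (With a 128-bit VL only the upper D lane is active here, so the Uaddv
      // above simply returns in[1] unchanged.)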
11439 } else {
11440 // Saddv
11441 ASSERT_EQUAL_64(0xfffffffffffffd62, d0);
11442 ASSERT_EQUAL_64(0xfffffffffffe8394, d1);
11443 ASSERT_EQUAL_64(0xfffffffed3e6fa0b, d2);
11444 // Uaddv
11445 ASSERT_EQUAL_64(0x0000000000000562, d4);
11446 ASSERT_EQUAL_64(0x0000000000028394, d5);
11447 ASSERT_EQUAL_64(0x00000001d3e6fa0b, d6);
11448 ASSERT_EQUAL_64(0x0a1c2e4052647687, d7);
11449 }
11450
11451 // Check the upper lanes above the top of the V register are all clear.
11452 for (int i = 1; i < core.GetSVELaneCount(kDRegSize); i++) {
11453 ASSERT_EQUAL_SVE_LANE(0, z0.VnD(), i);
11454 ASSERT_EQUAL_SVE_LANE(0, z1.VnD(), i);
11455 ASSERT_EQUAL_SVE_LANE(0, z2.VnD(), i);
11456 ASSERT_EQUAL_SVE_LANE(0, z4.VnD(), i);
11457 ASSERT_EQUAL_SVE_LANE(0, z5.VnD(), i);
11458 ASSERT_EQUAL_SVE_LANE(0, z6.VnD(), i);
11459 ASSERT_EQUAL_SVE_LANE(0, z7.VnD(), i);
11460 }
11461 }
11462}
11463
11464
11465TEST_SVE(sve_sminv_uminv) {
11466 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
11467 START();
11468
11469 uint64_t in[] = {0xfffa5555aaaaaaaa, 0x0011223344aafe80, 0x00112233aabbfc00};
11470 InsrHelper(&masm, z31.VnD(), in);
11471
11472 // For simplicity, we re-use the same pg for various lane sizes.
11473 // For D lanes: 1, 0, 1
11474 // For S lanes: 1, 1, 0, 0, 1
11475 // For H lanes: 1, 1, 0, 1, 1, 0, 0, 0, 1, 1
11476 int pg_in[] = {1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1};
11477 Initialise(&masm, p0.VnB(), pg_in);
11478
11479 // Make a copy so we can check that constructive operations preserve zn.
11480 __ Mov(z0, z31);
11481 __ Sminv(b0, p0, z0.VnB()); // destructive
11482 __ Sminv(h1, p0, z31.VnH());
11483 __ Mov(z2, z31);
11484 __ Sminv(s2, p0, z2.VnS()); // destructive
11485 __ Sminv(d3, p0, z31.VnD());
11486
11487 __ Uminv(b4, p0, z31.VnB());
11488 __ Mov(z5, z31);
11489 __ Uminv(h5, p0, z5.VnH()); // destructive
11490 __ Uminv(s6, p0, z31.VnS());
11491 __ Mov(z7, z31);
11492 __ Uminv(d7, p0, z7.VnD()); // destructive
11493
11494 END();
11495
11496 if (CAN_RUN()) {
11497 RUN();
11498
11499 if (static_cast<int>(ArrayLength(pg_in)) >= config->sve_vl_in_bytes()) {
11500 // Sminv
11501 ASSERT_EQUAL_64(0xaa, d0);
11502 ASSERT_EQUAL_64(0xaabb, d1);
11503 ASSERT_EQUAL_64(0xaabbfc00, d2);
11504 ASSERT_EQUAL_64(0x00112233aabbfc00, d3); // The smaller lane is inactive.
11505 // Uminv
11506 ASSERT_EQUAL_64(0, d4);
11507 ASSERT_EQUAL_64(0x2233, d5);
11508 ASSERT_EQUAL_64(0x112233, d6);
11509 ASSERT_EQUAL_64(0x00112233aabbfc00, d7); // The smaller lane is inactive.
11510 } else {
11511 // Sminv
11512 ASSERT_EQUAL_64(0xaa, d0);
11513 ASSERT_EQUAL_64(0xaaaa, d1);
11514 ASSERT_EQUAL_64(0xaaaaaaaa, d2);
11515 ASSERT_EQUAL_64(0xfffa5555aaaaaaaa, d3);
11516 // Uminv
11517 ASSERT_EQUAL_64(0, d4);
11518 ASSERT_EQUAL_64(0x2233, d5);
11519 ASSERT_EQUAL_64(0x112233, d6);
11520 ASSERT_EQUAL_64(0x00112233aabbfc00, d7);
11521 }
11522
11523 // Check the upper lanes above the top of the V register are all clear.
11524 for (int i = 1; i < core.GetSVELaneCount(kDRegSize); i++) {
11525 ASSERT_EQUAL_SVE_LANE(0, z0.VnD(), i);
11526 ASSERT_EQUAL_SVE_LANE(0, z1.VnD(), i);
11527 ASSERT_EQUAL_SVE_LANE(0, z2.VnD(), i);
11528 ASSERT_EQUAL_SVE_LANE(0, z3.VnD(), i);
11529 ASSERT_EQUAL_SVE_LANE(0, z4.VnD(), i);
11530 ASSERT_EQUAL_SVE_LANE(0, z5.VnD(), i);
11531 ASSERT_EQUAL_SVE_LANE(0, z6.VnD(), i);
11532 ASSERT_EQUAL_SVE_LANE(0, z7.VnD(), i);
11533 }
11534 }
11535}
11536
11537TEST_SVE(sve_smaxv_umaxv) {
11538 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
11539 START();
11540
11541 uint64_t in[] = {0xfffa5555aaaaaaaa, 0x0011223344aafe80, 0x00112233aabbfc00};
11542 InsrHelper(&masm, z31.VnD(), in);
11543
11544 // For simplicity, we re-use the same pg for various lane sizes.
11545 // For D lanes: 1, 0, 1
11546 // For S lanes: 1, 1, 0, 0, 1
11547 // For H lanes: 1, 1, 0, 1, 1, 0, 0, 0, 1, 1
11548 int pg_in[] = {1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1};
11549 Initialise(&masm, p0.VnB(), pg_in);
11550
11551 // Make a copy so we can check that constructive operations preserve zn.
11552 __ Mov(z0, z31);
11553 __ Smaxv(b0, p0, z0.VnB()); // destructive
11554 __ Smaxv(h1, p0, z31.VnH());
11555 __ Mov(z2, z31);
11556 __ Smaxv(s2, p0, z2.VnS()); // destructive
11557 __ Smaxv(d3, p0, z31.VnD());
11558
11559 __ Umaxv(b4, p0, z31.VnB());
11560 __ Mov(z5, z31);
11561 __ Umaxv(h5, p0, z5.VnH()); // destructive
11562 __ Umaxv(s6, p0, z31.VnS());
11563 __ Mov(z7, z31);
11564 __ Umaxv(d7, p0, z7.VnD()); // destructive
11565
11566 END();
11567
11568 if (CAN_RUN()) {
11569 RUN();
11570
11571 if (static_cast<int>(ArrayLength(pg_in)) >= config->sve_vl_in_bytes()) {
11572 // Smaxv
11573 ASSERT_EQUAL_64(0x33, d0);
11574 ASSERT_EQUAL_64(0x44aa, d1);
11575 ASSERT_EQUAL_64(0x112233, d2);
11576 ASSERT_EQUAL_64(0x112233aabbfc00, d3);
11577 // Umaxv
11578 ASSERT_EQUAL_64(0xfe, d4);
11579 ASSERT_EQUAL_64(0xfc00, d5);
11580 ASSERT_EQUAL_64(0xaabbfc00, d6);
11581 ASSERT_EQUAL_64(0x112233aabbfc00, d7);
11582 } else {
11583 // Smaxv
11584 ASSERT_EQUAL_64(0x33, d0);
11585 ASSERT_EQUAL_64(0x44aa, d1);
11586 ASSERT_EQUAL_64(0x112233, d2);
11587 ASSERT_EQUAL_64(0x00112233aabbfc00, d3);
11588 // Umaxv
11589 ASSERT_EQUAL_64(0xfe, d4);
11590 ASSERT_EQUAL_64(0xfc00, d5);
11591 ASSERT_EQUAL_64(0xaabbfc00, d6);
11592 ASSERT_EQUAL_64(0xfffa5555aaaaaaaa, d7);
11593 }
11594
11595 // Check the upper lanes above the top of the V register are all clear.
11596 for (int i = 1; i < core.GetSVELaneCount(kDRegSize); i++) {
11597 ASSERT_EQUAL_SVE_LANE(0, z0.VnD(), i);
11598 ASSERT_EQUAL_SVE_LANE(0, z1.VnD(), i);
11599 ASSERT_EQUAL_SVE_LANE(0, z2.VnD(), i);
11600 ASSERT_EQUAL_SVE_LANE(0, z3.VnD(), i);
11601 ASSERT_EQUAL_SVE_LANE(0, z4.VnD(), i);
11602 ASSERT_EQUAL_SVE_LANE(0, z5.VnD(), i);
11603 ASSERT_EQUAL_SVE_LANE(0, z6.VnD(), i);
11604 ASSERT_EQUAL_SVE_LANE(0, z7.VnD(), i);
11605 }
11606 }
11607}
11608
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011609template <typename T, size_t M, size_t N>
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011610static void SdotUdotHelper(Test* config,
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011611 unsigned lane_size_in_bits,
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011612 const T (&zd_inputs)[M],
11613 const T (&za_inputs)[M],
11614 const T (&zn_inputs)[N],
11615 const T (&zm_inputs)[N],
11616 const T (&zd_expected)[M],
11617 const T (&zdnm_expected)[M],
11618 bool is_signed,
11619 int index = -1) {
11620 VIXL_STATIC_ASSERT(N == (M * 4));
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011621 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
11622 START();
11623
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011624 auto dot_fn = [&](const ZRegister& zd,
11625 const ZRegister& za,
11626 const ZRegister& zn,
11627 const ZRegister& zm,
Martyn Capewell6e8db232022-01-07 16:38:14 +000011628 bool is_signed_fn,
11629 int index_fn) {
11630 if (is_signed_fn) {
11631 if (index_fn < 0) {
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011632 __ Sdot(zd, za, zn, zm);
11633 } else {
Martyn Capewell6e8db232022-01-07 16:38:14 +000011634 __ Sdot(zd, za, zn, zm, index_fn);
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011635 }
11636 } else {
Martyn Capewell6e8db232022-01-07 16:38:14 +000011637 if (index_fn < 0) {
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011638 __ Udot(zd, za, zn, zm);
11639 } else {
Martyn Capewell6e8db232022-01-07 16:38:14 +000011640 __ Udot(zd, za, zn, zm, index_fn);
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011641 }
11642 }
11643 };
11644
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011645 ZRegister zd = z0.WithLaneSize(lane_size_in_bits);
11646 ZRegister za = z1.WithLaneSize(lane_size_in_bits);
11647 ZRegister zn = z2.WithLaneSize(lane_size_in_bits / 4);
11648 ZRegister zm = z3.WithLaneSize(lane_size_in_bits / 4);
11649
11650 InsrHelper(&masm, zd, zd_inputs);
11651 InsrHelper(&masm, za, za_inputs);
11652 InsrHelper(&masm, zn, zn_inputs);
11653 InsrHelper(&masm, zm, zm_inputs);
11654
11655 // The Dot macro handles arbitrarily-aliased registers in the argument list.
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011656 ZRegister dm_result = z4.WithLaneSize(lane_size_in_bits);
11657 ZRegister dnm_result = z5.WithLaneSize(lane_size_in_bits);
11658 ZRegister da_result = z6.WithLaneSize(lane_size_in_bits);
11659 ZRegister dn_result = z7.WithLaneSize(lane_size_in_bits);
11660 ZRegister d_result = z8.WithLaneSize(lane_size_in_bits);
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011661
11662 __ Mov(da_result, za);
11663 // zda = zda + (zn . zm)
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011664 dot_fn(da_result, da_result, zn, zm, is_signed, index);
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011665
TatWai Chong50ef1712020-06-19 05:47:44 -070011666 __ Mov(dn_result, zn.WithSameLaneSizeAs(dn_result));
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011667 // zdn = za + (zdn . zm)
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011668 dot_fn(dn_result, za, dn_result.WithSameLaneSizeAs(zn), zm, is_signed, index);
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011669
TatWai Chong50ef1712020-06-19 05:47:44 -070011670 __ Mov(dm_result, zm.WithSameLaneSizeAs(dm_result));
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011671 // zdm = za + (zn . zdm)
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011672 dot_fn(dm_result, za, zn, dm_result.WithSameLaneSizeAs(zm), is_signed, index);
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011673
11674 __ Mov(d_result, zd);
11675 // zd = za + (zn . zm)
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011676 dot_fn(d_result, za, zn, zm, is_signed, index);
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011677
TatWai Chong50ef1712020-06-19 05:47:44 -070011678 __ Mov(dnm_result, zn.WithSameLaneSizeAs(dnm_result));
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011679  // zdnm = za + (zdnm . zdnm)
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011680 dot_fn(dnm_result,
11681 za,
11682 dnm_result.WithSameLaneSizeAs(zn),
11683 dnm_result.WithSameLaneSizeAs(zm),
11684 is_signed,
11685 index);
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011686
11687 END();
11688
11689 if (CAN_RUN()) {
11690 RUN();
11691
11692 ASSERT_EQUAL_SVE(za_inputs, z1.WithLaneSize(lane_size_in_bits));
11693 ASSERT_EQUAL_SVE(zn_inputs, z2.WithLaneSize(lane_size_in_bits / 4));
11694 ASSERT_EQUAL_SVE(zm_inputs, z3.WithLaneSize(lane_size_in_bits / 4));
11695
11696 ASSERT_EQUAL_SVE(zd_expected, da_result);
11697 ASSERT_EQUAL_SVE(zd_expected, dn_result);
11698 ASSERT_EQUAL_SVE(zd_expected, dm_result);
11699 ASSERT_EQUAL_SVE(zd_expected, d_result);
11700
11701 ASSERT_EQUAL_SVE(zdnm_expected, dnm_result);
11702 }
11703}
11704
11705TEST_SVE(sve_sdot) {
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011706 int64_t zd_inputs[] = {0x33, 0xee, 0xff};
11707 int64_t za_inputs[] = {INT32_MAX, -3, 2};
11708 int64_t zn_inputs[] = {-128, -128, -128, -128, 9, -1, 1, 30, -5, -20, 9, 8};
11709 int64_t zm_inputs[] = {-128, -128, -128, -128, -19, 15, 6, 0, 9, -5, 4, 5};
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011710
11711 // zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011712 int64_t zd_expected_s[] = {-2147418113, -183, 133}; // 0x8000ffff
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011713 int64_t zd_expected_d[] = {2147549183, -183, 133}; // 0x8000ffff
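  // For example, the lowest S lane is 2 + ((-5 * 9) + (-20 * -5) + (9 * 4) +
  // (8 * 5)) = 2 + 131 = 133.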
11714
11715 // zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011716 int64_t zdnm_expected_s[] = {-2147418113, 980, 572};
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011717 int64_t zdnm_expected_d[] = {2147549183, 980, 572};
11718
11719 SdotUdotHelper(config,
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011720 kSRegSize,
11721 zd_inputs,
11722 za_inputs,
11723 zn_inputs,
11724 zm_inputs,
11725 zd_expected_s,
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011726 zdnm_expected_s,
11727 true);
11728
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011729 SdotUdotHelper(config,
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011730 kDRegSize,
11731 zd_inputs,
11732 za_inputs,
11733 zn_inputs,
11734 zm_inputs,
11735 zd_expected_d,
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011736 zdnm_expected_d,
11737 true);
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011738}
11739
11740TEST_SVE(sve_udot) {
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011741 int64_t zd_inputs[] = {0x33, 0xee, 0xff};
11742 int64_t za_inputs[] = {INT32_MAX, -3, 2};
11743 int64_t zn_inputs[] = {-128, -128, -128, -128, 9, -1, 1, 30, -5, -20, 9, 8};
11744 int64_t zm_inputs[] = {-128, -128, -128, -128, -19, 15, 6, 0, 9, -5, 4, 5};
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011745
11746 // zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011747 int64_t zd_expected_s[] = {0x8000ffff, 0x00001749, 0x0000f085};
11748 int64_t zd_expected_d[] = {0x000000047c00ffff,
11749 0x000000000017ff49,
11750 0x00000000fff00085};
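  // For example, the lowest S lane treats the bytes as unsigned:
  // 2 + (251 * 9) + (236 * 251) + (9 * 4) + (8 * 5) = 61573 = 0xf085.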
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011751
11752 // zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011753 int64_t zdnm_expected_s[] = {0x8000ffff, 0x000101d4, 0x0001d03c};
11754 int64_t zdnm_expected_d[] = {0x000000047c00ffff,
11755 0x00000000fffe03d4,
11756 0x00000001ffce023c};
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011757
11758 SdotUdotHelper(config,
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011759 kSRegSize,
11760 zd_inputs,
11761 za_inputs,
11762 zn_inputs,
11763 zm_inputs,
11764 zd_expected_s,
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011765 zdnm_expected_s,
11766 false);
11767
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011768 SdotUdotHelper(config,
TatWai Chong4d2a4e92019-10-23 16:19:32 -070011769 kDRegSize,
11770 zd_inputs,
11771 za_inputs,
11772 zn_inputs,
11773 zm_inputs,
11774 zd_expected_d,
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070011775 zdnm_expected_d,
11776 false);
11777}
11778
11779TEST_SVE(sve_sdot_indexed_s) {
11780 int64_t zd_inputs[] = {0xff, 0xff, 0xff, 0xff};
11781 int64_t za_inputs[] = {0, 1, 2, 3};
11782 int64_t zn_inputs[] =
11783 {-1, -1, -1, -1, -2, -2, -2, -2, -3, -3, -3, -3, -4, -4, -4, -4};
11784 int64_t zm_inputs[] =
11785 {127, 127, 127, 127, -128, -128, -128, -128, -1, -1, -1, -1, 0, 0, 0, 0};
11786
11787 constexpr int s = kQRegSize / kSRegSize;
11788
11789 // zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
11790 int64_t zd_expected_s[][s] = {{0, 1, 2, 3}, // Generated from zm[0]
11791 {4, 9, 14, 19},
11792 {512, 1025, 1538, 2051},
11793 {-508, -1015, -1522, -2029}};
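  // For example, with index 1 the {-1, -1, -1, -1} group of zm is selected,
  // so the lowest lane becomes 3 + (4 * ((-4) * (-1))) = 19.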
11794
11795 // zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
11796 int64_t zdnm_expected_s[][s] = {{16, 33, 50, 67},
11797 {12, 25, 38, 51},
11798 {8, 17, 26, 35},
11799 {4, 9, 14, 19}};
11800
11801 for (unsigned i = 0; i < s; i++) {
11802 SdotUdotHelper(config,
11803 kSRegSize,
11804 zd_inputs,
11805 za_inputs,
11806 zn_inputs,
11807 zm_inputs,
11808 zd_expected_s[i],
11809 zdnm_expected_s[i],
11810 true,
11811 i);
11812 }
11813}
11814
11815TEST_SVE(sve_sdot_indexed_d) {
11816 int64_t zd_inputs[] = {0xff, 0xff};
11817 int64_t za_inputs[] = {0, 1};
11818 int64_t zn_inputs[] = {-1, -1, -1, -1, -1, -1, -1, -1};
11819 int64_t zm_inputs[] = {-128, -128, -128, -128, 127, 127, 127, 127};
11820
11821 constexpr int d = kQRegSize / kDRegSize;
11822
11823 // zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
11824 int64_t zd_expected_d[][d] = {{-508, -507}, // Generated from zm[0]
11825 {512, 513}};
11826
11827 // zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
11828 int64_t zdnm_expected_d[][d] = {{4, 5}, {4, 5}};
11829
11830 for (unsigned i = 0; i < d; i++) {
11831 SdotUdotHelper(config,
11832 kDRegSize,
11833 zd_inputs,
11834 za_inputs,
11835 zn_inputs,
11836 zm_inputs,
11837 zd_expected_d[i],
11838 zdnm_expected_d[i],
11839 true,
11840 i);
11841 }
11842}
11843
11844TEST_SVE(sve_udot_indexed_s) {
11845 int64_t zd_inputs[] = {0xff, 0xff, 0xff, 0xff};
11846 int64_t za_inputs[] = {0, 1, 2, 3};
11847 int64_t zn_inputs[] = {1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4};
11848 int64_t zm_inputs[] =
11849 {127, 127, 127, 127, 255, 255, 255, 255, 1, 1, 1, 1, 0, 0, 0, 0};
11850
11851 constexpr int s = kQRegSize / kSRegSize;
11852
11853 // zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
11854 int64_t zd_expected_s[][s] = {{0, 1, 2, 3},
11855 {4, 9, 14, 19},
11856 {1020, 2041, 3062, 4083},
11857 {508, 1017, 1526, 2035}};
11858
11859 // zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
11860 int64_t zdnm_expected_s[][s] = {{16, 33, 50, 67},
11861 {12, 25, 38, 51},
11862 {8, 17, 26, 35},
11863 {4, 9, 14, 19}};
11864
11865 for (unsigned i = 0; i < s; i++) {
11866 SdotUdotHelper(config,
11867 kSRegSize,
11868 zd_inputs,
11869 za_inputs,
11870 zn_inputs,
11871 zm_inputs,
11872 zd_expected_s[i],
11873 zdnm_expected_s[i],
11874 false,
11875 i);
11876 }
11877}
11878
11879TEST_SVE(sve_udot_indexed_d) {
11880 int64_t zd_inputs[] = {0xff, 0xff};
11881 int64_t za_inputs[] = {0, 1};
11882 int64_t zn_inputs[] = {1, 1, 1, 1, 1, 1, 1, 1};
11883 int64_t zm_inputs[] = {255, 255, 255, 255, 127, 127, 127, 127};
11884
11885 constexpr int d = kQRegSize / kDRegSize;
11886
11887 // zd_expected[] = za_inputs[] + (zn_inputs[] . zm_inputs[])
11888 int64_t zd_expected_d[][d] = {{508, 509}, {1020, 1021}};
11889
11890 // zdnm_expected[] = za_inputs[] + (zn_inputs[] . zn_inputs[])
11891 int64_t zdnm_expected_d[][d] = {{4, 5}, {4, 5}};
11892
11893 for (unsigned i = 0; i < d; i++) {
11894 SdotUdotHelper(config,
11895 kDRegSize,
11896 zd_inputs,
11897 za_inputs,
11898 zn_inputs,
11899 zm_inputs,
11900 zd_expected_d[i],
11901 zdnm_expected_d[i],
11902 false,
11903 i);
11904 }
11905}
11906
11907static void IntSegmentPatternHelper(MacroAssembler* masm,
11908 const ZRegister& dst,
11909 const ZRegister& src) {
11910 VIXL_ASSERT(AreSameLaneSize(dst, src));
11911 UseScratchRegisterScope temps(masm);
11912 ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(dst);
11913 masm->Index(ztmp, 0, 1);
11914 masm->Asr(ztmp, ztmp, kQRegSizeInBytesLog2 - dst.GetLaneSizeInBytesLog2());
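  // ztmp now holds each lane's 128-bit segment number (0 for the first
  // segment, 1 for the second, and so on), which is added to `src` below.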
11915 masm->Add(dst, src, ztmp);
11916}
11917
11918TEST_SVE(sve_sdot_udot_indexed_s) {
11919 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
11920 START();
11921
11922 const int multiplier = 2;
11923 __ Dup(z9.VnS(), multiplier);
11924
11925 __ Ptrue(p0.VnB());
11926 __ Index(z29.VnS(), 4, 1);
11927
11928 // z29 = [... 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0]
11929 __ And(z29.VnS(), z29.VnS(), 3);
11930
11931 // p7 = [... 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1]
11932 __ Cmple(p7.VnS(), p0.Zeroing(), z29.VnS(), 0);
11933
11934 // p6 = [... 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
11935 __ Cmple(p6.VnS(), p0.Zeroing(), z29.VnS(), 1);
11936
11937 // p5 = [... 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1]
11938 __ Cmple(p5.VnS(), p0.Zeroing(), z29.VnS(), 2);
11939
11940 __ Index(z28.VnB(), 1, 1);
11941 __ Dup(z27.VnS(), z28.VnS(), 0);
11942
11943 // z27 = [... 3, 2, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1]
11944 IntSegmentPatternHelper(&masm, z27.VnB(), z27.VnB());
11945
11946 // z27 = [... 6, 4, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1, 8, 6, 4, 2]
11947 __ Mul(z27.VnS(), p7.Merging(), z27.VnS(), z9.VnS());
11948
11949 // z27 = [... 12, 8, 4, 3, 2, 1, 4, 3, 2, 1, 8, 6, 4, 2, 16, 12, 8, 4]
11950 __ Mul(z27.VnS(), p6.Merging(), z27.VnS(), z9.VnS());
11951
11952 // 2nd segment | 1st segment |
11953 // v v
11954 // z27 = [... 24, 16, 4, 3, 2, 1, 8, 6, 4, 2, 16, 12, 8, 4, 32, 24, 16, 8]
11955 __ Mul(z27.VnS(), p5.Merging(), z27.VnS(), z9.VnS());
11956
11957 __ Dup(z0.VnS(), 0);
11958 __ Dup(z1.VnS(), 0);
11959 __ Dup(z2.VnS(), 0);
11960 __ Dup(z3.VnS(), 0);
11961 __ Dup(z4.VnS(), 0);
11962 __ Dup(z5.VnS(), 0);
11963
11964  // Skip lanes from the 129th onwards, since their values overflow after
11965  // the number sequence is created by `index`.
11966 __ Cmpls(p3.VnB(), p0.Zeroing(), z28.VnB(), 128);
11967 __ Mov(z0.VnB(), p3.Merging(), z27.VnB());
11968 __ Mov(z1.VnB(), p3.Merging(), z28.VnB());
11969
11970 __ Dup(z2.VnS(), 0);
11971 __ Dup(z3.VnS(), 0);
11972 __ Dup(z4.VnS(), 0);
11973 __ Dup(z5.VnS(), 0);
11974
11975 __ Udot(z2.VnS(), z2.VnS(), z1.VnB(), z0.VnB(), 0);
11976
11977 __ Udot(z3.VnS(), z3.VnS(), z1.VnB(), z0.VnB(), 1);
11978 __ Mul(z3.VnS(), z3.VnS(), 2);
11979
11980 __ Udot(z4.VnS(), z4.VnS(), z1.VnB(), z0.VnB(), 2);
11981 __ Mul(z4.VnS(), z4.VnS(), 4);
11982
11983 __ Udot(z5.VnS(), z5.VnS(), z1.VnB(), z0.VnB(), 3);
11984 __ Mul(z5.VnS(), z5.VnS(), 8);
11985
11986 __ Dup(z7.VnS(), 0);
11987 __ Dup(z8.VnS(), 0);
11988 __ Dup(z9.VnS(), 0);
11989 __ Dup(z10.VnS(), 0);
11990
11991  // Negate the all-positive vector in order to test the signed dot product.
11992 __ Neg(z6.VnB(), p0.Merging(), z0.VnB());
11993 __ Sdot(z7.VnS(), z7.VnS(), z1.VnB(), z6.VnB(), 0);
11994
11995 __ Sdot(z8.VnS(), z8.VnS(), z1.VnB(), z6.VnB(), 1);
11996 __ Mul(z8.VnS(), z8.VnS(), 2);
11997
11998 __ Sdot(z9.VnS(), z9.VnS(), z1.VnB(), z6.VnB(), 2);
11999 __ Mul(z9.VnS(), z9.VnS(), 4);
12000
12001 __ Sdot(z10.VnS(), z10.VnS(), z1.VnB(), z6.VnB(), 3);
12002 __ Mul(z10.VnS(), z10.VnS(), 8);
12003
12004 END();
12005
12006 if (CAN_RUN()) {
12007 RUN();
12008
12009    // Only compare the first 128-bit segment of the destination register; the
12010    // remaining segments are cross-checked against the other generated results.
12011 // s_lane[0] = (1 * 8) + (2 * 16) + (3 * 24) + (4 * 32) = 240
12012 // ...
12013 // s_lane[3] = (13 * 8) + (14 * 16) + (15 * 24) + (16 * 32) = 1200
12014 int udot_expected[] = {1200, 880, 560, 240};
12015 ASSERT_EQUAL_SVE(udot_expected, z2.VnS());
12016 ASSERT_EQUAL_SVE(z2.VnS(), z3.VnS());
12017 ASSERT_EQUAL_SVE(z2.VnS(), z4.VnS());
12018 ASSERT_EQUAL_SVE(z2.VnS(), z5.VnS());
12019
12020 int sdot_expected[] = {-1200, -880, -560, -240};
12021 ASSERT_EQUAL_SVE(sdot_expected, z7.VnS());
12022 ASSERT_EQUAL_SVE(z7.VnS(), z8.VnS());
12023 ASSERT_EQUAL_SVE(z7.VnS(), z9.VnS());
12024 ASSERT_EQUAL_SVE(z7.VnS(), z10.VnS());
12025 }
12026}
12027
12028TEST_SVE(sve_sdot_udot_indexed_d) {
12029 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
12030 START();
12031
12032 const int multiplier = 2;
12033 __ Dup(z9.VnD(), multiplier);
12034
12035 __ Ptrue(p0.VnD());
12036 __ Pfalse(p1.VnD());
12037
12038 // p2 = [..., 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
12039 __ Zip1(p2.VnD(), p0.VnD(), p1.VnD());
12040
12041 __ Index(z1.VnH(), 1, 1);
12042 __ Dup(z0.VnD(), z1.VnD(), 0);
12043
12044 // z0 = [... 5, 4, 3, 2, 5, 4, 3, 2, 4, 3, 2, 1, 4, 3, 2, 1]
12045 IntSegmentPatternHelper(&masm, z0.VnH(), z0.VnH());
12046
12047 // 2nd segment | 1st segment |
12048 // v v
12049 // z0 = [... 5, 4, 3, 2, 10, 8, 6, 4, 4, 3, 2, 1, 8, 6, 4, 2]
12050 __ Mul(z0.VnD(), p2.Merging(), z0.VnD(), z9.VnD());
12051
12052 __ Dup(z3.VnD(), 0);
12053 __ Dup(z4.VnD(), 0);
12054
12055 __ Udot(z3.VnD(), z3.VnD(), z1.VnH(), z0.VnH(), 0);
12056
12057 __ Udot(z4.VnD(), z4.VnD(), z1.VnH(), z0.VnH(), 1);
12058 __ Mul(z4.VnD(), z4.VnD(), multiplier);
12059
12060 __ Dup(z12.VnD(), 0);
12061 __ Dup(z13.VnD(), 0);
12062
12063 __ Ptrue(p4.VnH());
12064 __ Neg(z10.VnH(), p4.Merging(), z0.VnH());
12065
12066 __ Sdot(z12.VnD(), z12.VnD(), z1.VnH(), z10.VnH(), 0);
12067
12068 __ Sdot(z13.VnD(), z13.VnD(), z1.VnH(), z10.VnH(), 1);
12069 __ Mul(z13.VnD(), z13.VnD(), multiplier);
12070
12071 END();
12072
12073 if (CAN_RUN()) {
12074 RUN();
12075
12076    // Only compare the first 128-bit segment of the destination register; the
12077    // remaining segments are cross-checked against the other generated results.
12078 // d_lane[0] = (1 * 2) + (2 * 4) + (3 * 6) + (4 * 8) = 60
12079 // d_lane[1] = (5 * 2) + (6 * 4) + (7 * 6) + (8 * 8) = 140
12080 uint64_t udot_expected[] = {416, 304, 140, 60};
12081 ASSERT_EQUAL_SVE(udot_expected, z3.VnD());
12082 ASSERT_EQUAL_SVE(z3.VnD(), z4.VnD());
12083
12084 int64_t sdot_expected[] = {-416, -304, -140, -60};
12085 ASSERT_EQUAL_SVE(sdot_expected, z12.VnD());
12086 ASSERT_EQUAL_SVE(z12.VnD(), z13.VnD());
12087 }
TatWai Chong4d2a4e92019-10-23 16:19:32 -070012088}
12089
TatWai Chong7a0d3672019-10-23 17:35:18 -070012090template <typename T, size_t N>
12091static void FPToRawbitsWithSize(const T (&inputs)[N],
12092 uint64_t* outputs,
12093 unsigned size_in_bits) {
TatWai Chongfe536042019-10-23 16:34:11 -070012094 for (size_t i = 0; i < N; i++) {
TatWai Chong7a0d3672019-10-23 17:35:18 -070012095 outputs[i] = vixl::FPToRawbitsWithSize(size_in_bits, inputs[i]);
TatWai Chongfe536042019-10-23 16:34:11 -070012096 }
12097}
12098
TatWai Chong7a0d3672019-10-23 17:35:18 -070012099template <typename Ti, typename Te, size_t N>
12100static void FPBinArithHelper(Test* config,
12101 ArithFn macro,
12102 int lane_size_in_bits,
12103 const Ti (&zn_inputs)[N],
12104 const Ti (&zm_inputs)[N],
12105 const Te (&zd_expected)[N]) {
TatWai Chongfe536042019-10-23 16:34:11 -070012106 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
TatWai Chong7a0d3672019-10-23 17:35:18 -070012107
TatWai Chongfe536042019-10-23 16:34:11 -070012108 START();
12109
12110 ZRegister zd = z29.WithLaneSize(lane_size_in_bits);
12111 ZRegister zn = z30.WithLaneSize(lane_size_in_bits);
12112 ZRegister zm = z31.WithLaneSize(lane_size_in_bits);
12113
12114 uint64_t zn_rawbits[N];
12115 uint64_t zm_rawbits[N];
12116
TatWai Chong7a0d3672019-10-23 17:35:18 -070012117 FPToRawbitsWithSize(zn_inputs, zn_rawbits, lane_size_in_bits);
12118 FPToRawbitsWithSize(zm_inputs, zm_rawbits, lane_size_in_bits);
TatWai Chongfe536042019-10-23 16:34:11 -070012119
12120 InsrHelper(&masm, zn, zn_rawbits);
12121 InsrHelper(&masm, zm, zm_rawbits);
12122
12123 (masm.*macro)(zd, zn, zm);
12124
12125 END();
12126
12127 if (CAN_RUN()) {
12128 RUN();
12129
12130 ASSERT_EQUAL_SVE(zd_expected, zd);
12131 }
12132}
12133
12134TEST_SVE(sve_fp_arithmetic_unpredicated_fadd) {
12135 double zn_inputs[] = {24.0,
12136 5.5,
12137 0.0,
12138 3.875,
12139 2.125,
12140 kFP64PositiveInfinity,
12141 kFP64NegativeInfinity};
12142
12143 double zm_inputs[] = {1024.0, 2048.0, 0.1, -4.75, 12.34, 255.0, -13.0};
12144
TatWai Chong7a0d3672019-10-23 17:35:18 -070012145 ArithFn fn = &MacroAssembler::Fadd;
TatWai Chongfe536042019-10-23 16:34:11 -070012146
12147 uint16_t expected_h[] = {Float16ToRawbits(Float16(1048.0)),
12148 Float16ToRawbits(Float16(2053.5)),
12149 Float16ToRawbits(Float16(0.1)),
12150 Float16ToRawbits(Float16(-0.875)),
12151 Float16ToRawbits(Float16(14.465)),
12152 Float16ToRawbits(kFP16PositiveInfinity),
12153 Float16ToRawbits(kFP16NegativeInfinity)};
12154
TatWai Chong7a0d3672019-10-23 17:35:18 -070012155 FPBinArithHelper(config, fn, kHRegSize, zn_inputs, zm_inputs, expected_h);
TatWai Chongfe536042019-10-23 16:34:11 -070012156
12157 uint32_t expected_s[] = {FloatToRawbits(1048.0f),
12158 FloatToRawbits(2053.5f),
12159 FloatToRawbits(0.1f),
12160 FloatToRawbits(-0.875f),
12161 FloatToRawbits(14.465f),
12162 FloatToRawbits(kFP32PositiveInfinity),
12163 FloatToRawbits(kFP32NegativeInfinity)};
12164
TatWai Chong7a0d3672019-10-23 17:35:18 -070012165 FPBinArithHelper(config, fn, kSRegSize, zn_inputs, zm_inputs, expected_s);
TatWai Chongfe536042019-10-23 16:34:11 -070012166
12167 uint64_t expected_d[] = {DoubleToRawbits(1048.0),
12168 DoubleToRawbits(2053.5),
12169 DoubleToRawbits(0.1),
12170 DoubleToRawbits(-0.875),
12171 DoubleToRawbits(14.465),
12172 DoubleToRawbits(kFP64PositiveInfinity),
12173 DoubleToRawbits(kFP64NegativeInfinity)};
12174
TatWai Chong7a0d3672019-10-23 17:35:18 -070012175 FPBinArithHelper(config, fn, kDRegSize, zn_inputs, zm_inputs, expected_d);
TatWai Chongfe536042019-10-23 16:34:11 -070012176}
12177
12178TEST_SVE(sve_fp_arithmetic_unpredicated_fsub) {
12179 double zn_inputs[] = {24.0,
12180 5.5,
12181 0.0,
12182 3.875,
12183 2.125,
12184 kFP64PositiveInfinity,
12185 kFP64NegativeInfinity};
12186
12187 double zm_inputs[] = {1024.0, 2048.0, 0.1, -4.75, 12.34, 255.0, -13.0};
12188
TatWai Chong7a0d3672019-10-23 17:35:18 -070012189 ArithFn fn = &MacroAssembler::Fsub;
TatWai Chongfe536042019-10-23 16:34:11 -070012190
12191 uint16_t expected_h[] = {Float16ToRawbits(Float16(-1000.0)),
12192 Float16ToRawbits(Float16(-2042.5)),
12193 Float16ToRawbits(Float16(-0.1)),
12194 Float16ToRawbits(Float16(8.625)),
12195 Float16ToRawbits(Float16(-10.215)),
12196 Float16ToRawbits(kFP16PositiveInfinity),
12197 Float16ToRawbits(kFP16NegativeInfinity)};
12198
TatWai Chong7a0d3672019-10-23 17:35:18 -070012199 FPBinArithHelper(config, fn, kHRegSize, zn_inputs, zm_inputs, expected_h);
TatWai Chongfe536042019-10-23 16:34:11 -070012200
12201 uint32_t expected_s[] = {FloatToRawbits(-1000.0),
12202 FloatToRawbits(-2042.5),
12203 FloatToRawbits(-0.1),
12204 FloatToRawbits(8.625),
12205 FloatToRawbits(-10.215),
12206 FloatToRawbits(kFP32PositiveInfinity),
12207 FloatToRawbits(kFP32NegativeInfinity)};
12208
TatWai Chong7a0d3672019-10-23 17:35:18 -070012209 FPBinArithHelper(config, fn, kSRegSize, zn_inputs, zm_inputs, expected_s);
TatWai Chongfe536042019-10-23 16:34:11 -070012210
12211 uint64_t expected_d[] = {DoubleToRawbits(-1000.0),
12212 DoubleToRawbits(-2042.5),
12213 DoubleToRawbits(-0.1),
12214 DoubleToRawbits(8.625),
12215 DoubleToRawbits(-10.215),
12216 DoubleToRawbits(kFP64PositiveInfinity),
12217 DoubleToRawbits(kFP64NegativeInfinity)};
12218
TatWai Chong7a0d3672019-10-23 17:35:18 -070012219 FPBinArithHelper(config, fn, kDRegSize, zn_inputs, zm_inputs, expected_d);
TatWai Chongfe536042019-10-23 16:34:11 -070012220}
12221
12222TEST_SVE(sve_fp_arithmetic_unpredicated_fmul) {
12223 double zn_inputs[] = {24.0,
12224 5.5,
12225 0.0,
12226 3.875,
12227 2.125,
12228 kFP64PositiveInfinity,
12229 kFP64NegativeInfinity};
12230
12231 double zm_inputs[] = {1024.0, 2048.0, 0.1, -4.75, 12.34, 255.0, -13.0};
12232
TatWai Chong7a0d3672019-10-23 17:35:18 -070012233 ArithFn fn = &MacroAssembler::Fmul;
TatWai Chongfe536042019-10-23 16:34:11 -070012234
12235 uint16_t expected_h[] = {Float16ToRawbits(Float16(24576.0)),
12236 Float16ToRawbits(Float16(11264.0)),
12237 Float16ToRawbits(Float16(0.0)),
12238 Float16ToRawbits(Float16(-18.4)),
12239 Float16ToRawbits(Float16(26.23)),
12240 Float16ToRawbits(kFP16PositiveInfinity),
12241 Float16ToRawbits(kFP16PositiveInfinity)};
12242
TatWai Chong7a0d3672019-10-23 17:35:18 -070012243 FPBinArithHelper(config, fn, kHRegSize, zn_inputs, zm_inputs, expected_h);
TatWai Chongfe536042019-10-23 16:34:11 -070012244
12245 uint32_t expected_s[] = {FloatToRawbits(24576.0),
12246 FloatToRawbits(11264.0),
12247 FloatToRawbits(0.0),
12248 FloatToRawbits(-18.40625),
12249 FloatToRawbits(26.2225),
12250 FloatToRawbits(kFP32PositiveInfinity),
12251 FloatToRawbits(kFP32PositiveInfinity)};
12252
TatWai Chong7a0d3672019-10-23 17:35:18 -070012253 FPBinArithHelper(config, fn, kSRegSize, zn_inputs, zm_inputs, expected_s);
TatWai Chongfe536042019-10-23 16:34:11 -070012254
12255 uint64_t expected_d[] = {DoubleToRawbits(24576.0),
12256 DoubleToRawbits(11264.0),
12257 DoubleToRawbits(0.0),
12258 DoubleToRawbits(-18.40625),
12259 DoubleToRawbits(26.2225),
12260 DoubleToRawbits(kFP64PositiveInfinity),
12261 DoubleToRawbits(kFP64PositiveInfinity)};
12262
TatWai Chong7a0d3672019-10-23 17:35:18 -070012263 FPBinArithHelper(config, fn, kDRegSize, zn_inputs, zm_inputs, expected_d);
TatWai Chongfe536042019-10-23 16:34:11 -070012264}
12265
TatWai Chong7a0d3672019-10-23 17:35:18 -070012266typedef void (MacroAssembler::*FPArithPredicatedFn)(
12267 const ZRegister& zd,
12268 const PRegisterM& pg,
12269 const ZRegister& zn,
12270 const ZRegister& zm,
12271 FPMacroNaNPropagationOption nan_option);
12272
Martyn Capewell37f28182020-01-14 10:15:10 +000012273typedef void (MacroAssembler::*FPArithPredicatedNoNaNOptFn)(
12274 const ZRegister& zd,
12275 const PRegisterM& pg,
12276 const ZRegister& zn,
12277 const ZRegister& zm);
12278
TatWai Chong7a0d3672019-10-23 17:35:18 -070012279template <typename Ti, typename Te, size_t N>
12280static void FPBinArithHelper(
12281 Test* config,
12282 FPArithPredicatedFn macro,
Martyn Capewell37f28182020-01-14 10:15:10 +000012283 FPArithPredicatedNoNaNOptFn macro_nonan,
TatWai Chong7a0d3672019-10-23 17:35:18 -070012284 unsigned lane_size_in_bits,
12285 const Ti (&zd_inputs)[N],
12286 const int (&pg_inputs)[N],
12287 const Ti (&zn_inputs)[N],
12288 const Ti (&zm_inputs)[N],
12289 const Te (&zd_expected)[N],
12290 FPMacroNaNPropagationOption nan_option = FastNaNPropagation) {
Martyn Capewell37f28182020-01-14 10:15:10 +000012291 VIXL_ASSERT((macro == NULL) ^ (macro_nonan == NULL));
TatWai Chongd316c5e2019-10-16 12:22:10 -070012292 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
12293 START();
12294
TatWai Chong7a0d3672019-10-23 17:35:18 -070012295 // Avoid choosing default scratch registers.
12296 ZRegister zd = z26.WithLaneSize(lane_size_in_bits);
12297 ZRegister zn = z27.WithLaneSize(lane_size_in_bits);
12298 ZRegister zm = z28.WithLaneSize(lane_size_in_bits);
TatWai Chongd316c5e2019-10-16 12:22:10 -070012299
TatWai Chong7a0d3672019-10-23 17:35:18 -070012300 uint64_t zn_inputs_rawbits[N];
12301 uint64_t zm_inputs_rawbits[N];
12302 uint64_t zd_inputs_rawbits[N];
TatWai Chongd316c5e2019-10-16 12:22:10 -070012303
TatWai Chong7a0d3672019-10-23 17:35:18 -070012304 FPToRawbitsWithSize(zn_inputs, zn_inputs_rawbits, lane_size_in_bits);
12305 FPToRawbitsWithSize(zm_inputs, zm_inputs_rawbits, lane_size_in_bits);
12306 FPToRawbitsWithSize(zd_inputs, zd_inputs_rawbits, lane_size_in_bits);
12307
12308 InsrHelper(&masm, zn, zn_inputs_rawbits);
12309 InsrHelper(&masm, zm, zm_inputs_rawbits);
12310 InsrHelper(&masm, zd, zd_inputs_rawbits);
TatWai Chongd316c5e2019-10-16 12:22:10 -070012311
12312 PRegisterWithLaneSize pg = p0.WithLaneSize(lane_size_in_bits);
12313 Initialise(&masm, pg, pg_inputs);
12314
12315 // `instr` zdn, pg, zdn, zm
12316 ZRegister dn_result = z0.WithLaneSize(lane_size_in_bits);
12317 __ Mov(dn_result, zn);
Martyn Capewell37f28182020-01-14 10:15:10 +000012318 if (macro_nonan == NULL) {
12319 (masm.*macro)(dn_result, pg.Merging(), dn_result, zm, nan_option);
12320 } else {
12321 (masm.*macro_nonan)(dn_result, pg.Merging(), dn_result, zm);
12322 }
TatWai Chongd316c5e2019-10-16 12:22:10 -070012323
12324 // Based on whether zd and zm registers are aliased, the macro of instructions
12325 // (`Instr`) swaps the order of operands if it has the commutative property,
12326 // otherwise, transfer to the reversed `Instr`, such as fdivr.
12327 // `instr` zdm, pg, zn, zdm
12328 ZRegister dm_result = z1.WithLaneSize(lane_size_in_bits);
12329 __ Mov(dm_result, zm);
Martyn Capewell37f28182020-01-14 10:15:10 +000012330 if (macro_nonan == NULL) {
12331 (masm.*macro)(dm_result, pg.Merging(), zn, dm_result, nan_option);
12332 } else {
12333 (masm.*macro_nonan)(dm_result, pg.Merging(), zn, dm_result);
12334 }
TatWai Chongd316c5e2019-10-16 12:22:10 -070012335
12336 // The macro of instructions (`Instr`) automatically selects between `instr`
12337 // and movprfx + `instr` based on whether zd and zn registers are aliased.
12338  // A generated movprfx instruction is predicated, using the same
12339 // governing predicate register. In order to keep the result constant,
12340 // initialize the destination register first.
12341 // `instr` zd, pg, zn, zm
12342 ZRegister d_result = z2.WithLaneSize(lane_size_in_bits);
12343 __ Mov(d_result, zd);
Martyn Capewell37f28182020-01-14 10:15:10 +000012344 if (macro_nonan == NULL) {
12345 (masm.*macro)(d_result, pg.Merging(), zn, zm, nan_option);
12346 } else {
12347 (masm.*macro_nonan)(d_result, pg.Merging(), zn, zm);
12348 }
TatWai Chongd316c5e2019-10-16 12:22:10 -070012349
12350 END();
12351
12352 if (CAN_RUN()) {
12353 RUN();
12354
12355 for (size_t i = 0; i < ArrayLength(zd_expected); i++) {
12356 int lane = static_cast<int>(ArrayLength(zd_expected) - i - 1);
12357 if (!core.HasSVELane(dn_result, lane)) break;
12358 if ((pg_inputs[i] & 1) != 0) {
12359 ASSERT_EQUAL_SVE_LANE(zd_expected[i], dn_result, lane);
12360 } else {
TatWai Chong7a0d3672019-10-23 17:35:18 -070012361 ASSERT_EQUAL_SVE_LANE(zn_inputs_rawbits[i], dn_result, lane);
TatWai Chongd316c5e2019-10-16 12:22:10 -070012362 }
12363 }
12364
12365 for (size_t i = 0; i < ArrayLength(zd_expected); i++) {
12366 int lane = static_cast<int>(ArrayLength(zd_expected) - i - 1);
12367 if (!core.HasSVELane(dm_result, lane)) break;
12368 if ((pg_inputs[i] & 1) != 0) {
12369 ASSERT_EQUAL_SVE_LANE(zd_expected[i], dm_result, lane);
12370 } else {
TatWai Chong7a0d3672019-10-23 17:35:18 -070012371 ASSERT_EQUAL_SVE_LANE(zm_inputs_rawbits[i], dm_result, lane);
TatWai Chongd316c5e2019-10-16 12:22:10 -070012372 }
12373 }
12374
12375 ASSERT_EQUAL_SVE(zd_expected, d_result);
12376 }
12377}
12378
12379TEST_SVE(sve_binary_arithmetic_predicated_fdiv) {
TatWai Chong7a0d3672019-10-23 17:35:18 -070012380 // The inputs are shared with different precision tests.
TatWai Chongd316c5e2019-10-16 12:22:10 -070012381 double zd_in[] = {0.1, 1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9};
12382
12383 double zn_in[] = {24.0,
12384 24.0,
12385 -2.0,
12386 -2.0,
12387 5.5,
12388 5.5,
12389 kFP64PositiveInfinity,
12390 kFP64PositiveInfinity,
12391 kFP64NegativeInfinity,
12392 kFP64NegativeInfinity};
12393
12394 double zm_in[] = {-2.0, -2.0, 24.0, 24.0, 0.5, 0.5, 0.65, 0.65, 24.0, 24.0};
12395
TatWai Chongd316c5e2019-10-16 12:22:10 -070012396 int pg_in[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
12397
TatWai Chong7a0d3672019-10-23 17:35:18 -070012398 uint16_t exp_h[] = {Float16ToRawbits(Float16(0.1)),
TatWai Chongd316c5e2019-10-16 12:22:10 -070012399 Float16ToRawbits(Float16(-12.0)),
12400 Float16ToRawbits(Float16(2.2)),
12401 Float16ToRawbits(Float16(-0.0833)),
12402 Float16ToRawbits(Float16(4.4)),
12403 Float16ToRawbits(Float16(11.0)),
12404 Float16ToRawbits(Float16(6.6)),
12405 Float16ToRawbits(kFP16PositiveInfinity),
12406 Float16ToRawbits(Float16(8.8)),
12407 Float16ToRawbits(kFP16NegativeInfinity)};
12408
TatWai Chong7a0d3672019-10-23 17:35:18 -070012409 FPBinArithHelper(config,
Martyn Capewell37f28182020-01-14 10:15:10 +000012410 NULL,
TatWai Chong7a0d3672019-10-23 17:35:18 -070012411 &MacroAssembler::Fdiv,
12412 kHRegSize,
12413 zd_in,
12414 pg_in,
12415 zn_in,
12416 zm_in,
12417 exp_h);
TatWai Chongd316c5e2019-10-16 12:22:10 -070012418
12419 uint32_t exp_s[] = {FloatToRawbits(0.1),
12420 FloatToRawbits(-12.0),
12421 FloatToRawbits(2.2),
12422 0xbdaaaaab,
12423 FloatToRawbits(4.4),
12424 FloatToRawbits(11.0),
12425 FloatToRawbits(6.6),
12426 FloatToRawbits(kFP32PositiveInfinity),
12427 FloatToRawbits(8.8),
12428 FloatToRawbits(kFP32NegativeInfinity)};
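  // (0xbdaaaaab is the single-precision encoding of -2.0 / 24.0, roughly
  // -0.0833333, which has no exact binary representation.)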
12429
TatWai Chong7a0d3672019-10-23 17:35:18 -070012430 FPBinArithHelper(config,
Martyn Capewell37f28182020-01-14 10:15:10 +000012431 NULL,
TatWai Chong7a0d3672019-10-23 17:35:18 -070012432 &MacroAssembler::Fdiv,
12433 kSRegSize,
12434 zd_in,
12435 pg_in,
12436 zn_in,
12437 zm_in,
12438 exp_s);
TatWai Chongd316c5e2019-10-16 12:22:10 -070012439
12440 uint64_t exp_d[] = {DoubleToRawbits(0.1),
12441 DoubleToRawbits(-12.0),
12442 DoubleToRawbits(2.2),
12443 0xbfb5555555555555,
12444 DoubleToRawbits(4.4),
12445 DoubleToRawbits(11.0),
12446 DoubleToRawbits(6.6),
12447 DoubleToRawbits(kFP64PositiveInfinity),
12448 DoubleToRawbits(8.8),
12449 DoubleToRawbits(kFP64NegativeInfinity)};
12450
TatWai Chong7a0d3672019-10-23 17:35:18 -070012451 FPBinArithHelper(config,
Martyn Capewell37f28182020-01-14 10:15:10 +000012452 NULL,
TatWai Chong7a0d3672019-10-23 17:35:18 -070012453 &MacroAssembler::Fdiv,
12454 kDRegSize,
12455 zd_in,
12456 pg_in,
12457 zn_in,
12458 zm_in,
12459 exp_d);
TatWai Chongd316c5e2019-10-16 12:22:10 -070012460}
12461
Martyn Capewell9cc3f142019-10-29 14:06:35 +000012462TEST_SVE(sve_select) {
12463 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
12464 START();
12465
12466 uint64_t in0[] = {0x01f203f405f607f8, 0xfefcf8f0e1c3870f, 0x123456789abcdef0};
12467 uint64_t in1[] = {0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa};
12468
12469 // For simplicity, we re-use the same pg for various lane sizes.
12470 // For D lanes: 1, 1, 0
12471 // For S lanes: 1, 1, 1, 0, 0
12472 // For H lanes: 0, 1, 0, 1, 1, 1, 0, 0, 1, 0
12473 int pg_in[] = {1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0};
12474 Initialise(&masm, p0.VnB(), pg_in);
12475 PRegisterM pg = p0.Merging();
12476
12477 InsrHelper(&masm, z30.VnD(), in0);
12478 InsrHelper(&masm, z31.VnD(), in1);
12479
12480 __ Sel(z0.VnB(), pg, z30.VnB(), z31.VnB());
12481 __ Sel(z1.VnH(), pg, z30.VnH(), z31.VnH());
12482 __ Sel(z2.VnS(), pg, z30.VnS(), z31.VnS());
12483 __ Sel(z3.VnD(), pg, z30.VnD(), z31.VnD());
12484
12485 END();
12486
12487 if (CAN_RUN()) {
12488 RUN();
12489
12490 uint64_t expected_z0[] = {0xaaaaaaaa05aa07f8,
12491 0xfeaaaaf0aac3870f,
12492 0xaaaa56aa9abcdeaa};
12493 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
12494
12495 uint64_t expected_z1[] = {0xaaaaaaaaaaaa07f8,
12496 0xaaaaf8f0e1c3870f,
12497 0xaaaaaaaa9abcaaaa};
12498 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
12499
12500 uint64_t expected_z2[] = {0xaaaaaaaa05f607f8,
12501 0xfefcf8f0e1c3870f,
12502 0xaaaaaaaaaaaaaaaa};
12503 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
12504
12505 uint64_t expected_z3[] = {0x01f203f405f607f8,
12506 0xfefcf8f0e1c3870f,
12507 0xaaaaaaaaaaaaaaaa};
12508 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
12509 }
12510}
TatWai Chongd316c5e2019-10-16 12:22:10 -070012511
TatWai Chong7a0d3672019-10-23 17:35:18 -070012512TEST_SVE(sve_binary_arithmetic_predicated_fmax_fmin_h) {
12513 double zd_inputs[] = {1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8};
12514 double zn_inputs[] = {-2.1,
12515 8.5,
12516 225.5,
12517 0.0,
12518 8.8,
12519 -4.75,
12520 kFP64PositiveInfinity,
12521 kFP64NegativeInfinity};
12522 double zm_inputs[] = {-2.0,
12523 -13.0,
12524 24.0,
12525 0.01,
12526 0.5,
12527 300.75,
12528 kFP64NegativeInfinity,
12529 kFP64PositiveInfinity};
12530 int pg_inputs[] = {1, 1, 0, 1, 0, 1, 1, 1};
12531
12532 uint16_t zd_expected_max[] = {Float16ToRawbits(Float16(-2.0)),
12533 Float16ToRawbits(Float16(8.5)),
12534 Float16ToRawbits(Float16(3.3)),
12535 Float16ToRawbits(Float16(0.01)),
12536 Float16ToRawbits(Float16(5.5)),
12537 Float16ToRawbits(Float16(300.75)),
12538 Float16ToRawbits(kFP16PositiveInfinity),
12539 Float16ToRawbits(kFP16PositiveInfinity)};
12540 FPBinArithHelper(config,
12541 &MacroAssembler::Fmax,
Martyn Capewell37f28182020-01-14 10:15:10 +000012542 NULL,
TatWai Chong7a0d3672019-10-23 17:35:18 -070012543 kHRegSize,
12544 zd_inputs,
12545 pg_inputs,
12546 zn_inputs,
12547 zm_inputs,
12548 zd_expected_max);
12549
12550 uint16_t zd_expected_min[] = {Float16ToRawbits(Float16(-2.1)),
12551 Float16ToRawbits(Float16(-13.0)),
12552 Float16ToRawbits(Float16(3.3)),
12553 Float16ToRawbits(Float16(0.0)),
12554 Float16ToRawbits(Float16(5.5)),
12555 Float16ToRawbits(Float16(-4.75)),
12556 Float16ToRawbits(kFP16NegativeInfinity),
12557 Float16ToRawbits(kFP16NegativeInfinity)};
12558 FPBinArithHelper(config,
12559 &MacroAssembler::Fmin,
Martyn Capewell37f28182020-01-14 10:15:10 +000012560 NULL,
TatWai Chong7a0d3672019-10-23 17:35:18 -070012561 kHRegSize,
12562 zd_inputs,
12563 pg_inputs,
12564 zn_inputs,
12565 zm_inputs,
12566 zd_expected_min);
12567}
12568
12569TEST_SVE(sve_binary_arithmetic_predicated_fmax_fmin_s) {
12570 double zd_inputs[] = {1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8};
12571 double zn_inputs[] = {-2.1,
12572 8.5,
12573 225.5,
12574 0.0,
12575 8.8,
12576 -4.75,
12577 kFP64PositiveInfinity,
12578 kFP64NegativeInfinity};
12579 double zm_inputs[] = {-2.0,
12580 -13.0,
12581 24.0,
12582 0.01,
12583 0.5,
12584 300.75,
12585 kFP64NegativeInfinity,
12586 kFP64PositiveInfinity};
12587 int pg_inputs[] = {1, 1, 0, 1, 0, 1, 1, 1};
12588
12589 uint32_t zd_expected_max[] = {FloatToRawbits(-2.0),
12590 FloatToRawbits(8.5),
12591 FloatToRawbits(3.3),
12592 FloatToRawbits(0.01),
12593 FloatToRawbits(5.5),
12594 FloatToRawbits(300.75),
12595 FloatToRawbits(kFP32PositiveInfinity),
12596 FloatToRawbits(kFP32PositiveInfinity)};
12597 FPBinArithHelper(config,
12598 &MacroAssembler::Fmax,
Martyn Capewell37f28182020-01-14 10:15:10 +000012599 NULL,
TatWai Chong7a0d3672019-10-23 17:35:18 -070012600 kSRegSize,
12601 zd_inputs,
12602 pg_inputs,
12603 zn_inputs,
12604 zm_inputs,
12605 zd_expected_max);
12606
12607 uint32_t zd_expected_min[] = {FloatToRawbits(-2.1),
12608 FloatToRawbits(-13.0),
12609 FloatToRawbits(3.3),
12610 FloatToRawbits(0.0),
12611 FloatToRawbits(5.5),
12612 FloatToRawbits(-4.75),
12613 FloatToRawbits(kFP32NegativeInfinity),
12614 FloatToRawbits(kFP32NegativeInfinity)};
12615 FPBinArithHelper(config,
12616 &MacroAssembler::Fmin,
Martyn Capewell37f28182020-01-14 10:15:10 +000012617 NULL,
TatWai Chong7a0d3672019-10-23 17:35:18 -070012618 kSRegSize,
12619 zd_inputs,
12620 pg_inputs,
12621 zn_inputs,
12622 zm_inputs,
12623 zd_expected_min);
12624}
12625
12626TEST_SVE(sve_binary_arithmetic_predicated_fmax_fmin_d) {
12627 double zd_inputs[] = {1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8};
12628 double zn_inputs[] = {-2.1,
12629 8.5,
12630 225.5,
12631 0.0,
12632 8.8,
12633 -4.75,
12634 kFP64PositiveInfinity,
12635 kFP64NegativeInfinity};
12636 double zm_inputs[] = {-2.0,
12637 -13.0,
12638 24.0,
12639 0.01,
12640 0.5,
12641 300.75,
12642 kFP64NegativeInfinity,
12643 kFP64PositiveInfinity};
12644 int pg_inputs[] = {1, 1, 0, 1, 0, 1, 1, 1};
12645
12646 uint64_t zd_expected_max[] = {DoubleToRawbits(-2.0),
12647 DoubleToRawbits(8.5),
12648 DoubleToRawbits(3.3),
12649 DoubleToRawbits(0.01),
12650 DoubleToRawbits(5.5),
12651 DoubleToRawbits(300.75),
12652 DoubleToRawbits(kFP64PositiveInfinity),
12653 DoubleToRawbits(kFP64PositiveInfinity)};
12654 FPBinArithHelper(config,
12655 &MacroAssembler::Fmax,
Martyn Capewell37f28182020-01-14 10:15:10 +000012656 NULL,
TatWai Chong7a0d3672019-10-23 17:35:18 -070012657 kDRegSize,
12658 zd_inputs,
12659 pg_inputs,
12660 zn_inputs,
12661 zm_inputs,
12662 zd_expected_max);
12663
12664 uint64_t zd_expected_min[] = {DoubleToRawbits(-2.1),
12665 DoubleToRawbits(-13.0),
12666 DoubleToRawbits(3.3),
12667 DoubleToRawbits(0.0),
12668 DoubleToRawbits(5.5),
12669 DoubleToRawbits(-4.75),
12670 DoubleToRawbits(kFP64NegativeInfinity),
12671 DoubleToRawbits(kFP64NegativeInfinity)};
12672 FPBinArithHelper(config,
12673 &MacroAssembler::Fmin,
Martyn Capewell37f28182020-01-14 10:15:10 +000012674 NULL,
TatWai Chong7a0d3672019-10-23 17:35:18 -070012675 kDRegSize,
12676 zd_inputs,
12677 pg_inputs,
12678 zn_inputs,
12679 zm_inputs,
12680 zd_expected_min);
12681}
TatWai Chong29a0c432019-11-06 22:20:44 -080012682
12683template <typename T, size_t N>
12684static void BitwiseShiftImmHelper(Test* config,
12685 int lane_size_in_bits,
12686 const T (&zn_inputs)[N],
12687 int shift) {
12688 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
12689 START();
12690
12691 ZRegister zd_asr = z25.WithLaneSize(lane_size_in_bits);
12692 ZRegister zd_lsr = z26.WithLaneSize(lane_size_in_bits);
12693 ZRegister zd_lsl = z27.WithLaneSize(lane_size_in_bits);
12694 ZRegister zn = z28.WithLaneSize(lane_size_in_bits);
12695
12696 InsrHelper(&masm, zn, zn_inputs);
12697
12698 __ Asr(zd_asr, zn, shift);
12699 __ Lsr(zd_lsr, zn, shift);
Martyn Capewell147b0ba2020-02-19 11:16:02 +000012700  __ Lsl(zd_lsl, zn, shift - 1);  // Lsl supports shifts of 0 to lane_size-1.
TatWai Chong29a0c432019-11-06 22:20:44 -080012701
12702 END();
12703
12704 if (CAN_RUN()) {
12705 RUN();
12706
12707 const uint64_t mask = GetUintMask(lane_size_in_bits);
12708 for (int i = 0; i < static_cast<int>(N); i++) {
12709 int lane = N - i - 1;
12710 if (!core.HasSVELane(zd_asr, lane)) break;
12711 bool is_negative = (zn_inputs[i] & GetSignMask(lane_size_in_bits)) != 0;
12712 uint64_t result;
12713 if (shift >= lane_size_in_bits) {
12714 result = is_negative ? mask : 0;
12715 } else {
12716 result = zn_inputs[i] >> shift;
12717 if (is_negative) {
12718 result |= mask << (lane_size_in_bits - shift);
12719 result &= mask;
12720 }
12721 }
12722 ASSERT_EQUAL_SVE_LANE(result, zd_asr, lane);
12723 }
12724
12725 for (int i = 0; i < static_cast<int>(N); i++) {
12726 int lane = N - i - 1;
12727 if (!core.HasSVELane(zd_lsr, lane)) break;
12728 uint64_t result =
12729 (shift >= lane_size_in_bits) ? 0 : zn_inputs[i] >> shift;
12730 ASSERT_EQUAL_SVE_LANE(result, zd_lsr, lane);
12731 }
12732
12733 for (int i = 0; i < static_cast<int>(N); i++) {
12734 int lane = N - i - 1;
12735 if (!core.HasSVELane(zd_lsl, lane)) break;
Jacob Bramley504d5e92020-05-21 11:40:21 +010012736 uint64_t result =
12737 (shift > lane_size_in_bits) ? 0 : zn_inputs[i] << (shift - 1);
TatWai Chong29a0c432019-11-06 22:20:44 -080012738 ASSERT_EQUAL_SVE_LANE(result & mask, zd_lsl, lane);
12739 }
12740 }
12741}
12742
12743TEST_SVE(sve_bitwise_shift_imm_unpredicated) {
12744 uint64_t inputs_b[] = {0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80};
12745 int shift_b[] = {1, 3, 5, 8};
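  // For example, with a shift of 3, the 0x80 input produces 0xf0 for Asr and
  // 0x10 for Lsr; the helper uses shift - 1 (= 2) for Lsl, giving 0x00.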
12746 for (size_t i = 0; i < ArrayLength(shift_b); i++) {
12747 BitwiseShiftImmHelper(config, kBRegSize, inputs_b, shift_b[i]);
12748 }
12749
12750 uint64_t inputs_h[] = {0xfedc, 0xfa55, 0x0011, 0x2233};
12751 int shift_h[] = {1, 8, 11, 16};
12752 for (size_t i = 0; i < ArrayLength(shift_h); i++) {
12753 BitwiseShiftImmHelper(config, kHRegSize, inputs_h, shift_h[i]);
12754 }
12755
12756 uint64_t inputs_s[] = {0xfedcba98, 0xfffa55aa, 0x00112233};
12757 int shift_s[] = {1, 9, 17, 32};
12758 for (size_t i = 0; i < ArrayLength(shift_s); i++) {
12759 BitwiseShiftImmHelper(config, kSRegSize, inputs_s, shift_s[i]);
12760 }
12761
12762 uint64_t inputs_d[] = {0xfedcba98fedcba98,
12763 0xfffa5555aaaaaaaa,
12764 0x0011223344aafe80};
12765 int shift_d[] = {1, 23, 45, 64};
12766 for (size_t i = 0; i < ArrayLength(shift_d); i++) {
12767 BitwiseShiftImmHelper(config, kDRegSize, inputs_d, shift_d[i]);
12768 }
12769}
12770
12771template <typename T, typename R, size_t N>
12772static void BitwiseShiftWideElementsHelper(Test* config,
12773 Shift shift_type,
12774 int lane_size_in_bits,
12775 const T (&zn_inputs)[N],
12776 const R& zm_inputs,
12777 const T (&zd_expected)[N]) {
12778 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
12779 START();
12780
12781 ArithFn macro;
12782  // A logical shift left or right by the full lane width produces 0, so
12783  // initialize the array to 0 for convenience.
12784 uint64_t zd_expected_max_shift_amount[N] = {0};
12785 switch (shift_type) {
12786 case ASR: {
12787 macro = &MacroAssembler::Asr;
12788 uint64_t mask = GetUintMask(lane_size_in_bits);
12789 for (size_t i = 0; i < ArrayLength(zn_inputs); i++) {
12790 bool is_negative = (zn_inputs[i] & GetSignMask(lane_size_in_bits)) != 0;
12791 zd_expected_max_shift_amount[i] = is_negative ? mask : 0;
12792 }
12793 break;
12794 }
12795 case LSR:
12796 macro = &MacroAssembler::Lsr;
12797 break;
12798 case LSL:
12799 macro = &MacroAssembler::Lsl;
12800 break;
12801 default:
12802 VIXL_UNIMPLEMENTED();
12803 macro = NULL;
12804 break;
12805 }
12806
12807 ZRegister zd = z26.WithLaneSize(lane_size_in_bits);
12808 ZRegister zn = z27.WithLaneSize(lane_size_in_bits);
12809 ZRegister zm = z28.WithLaneSize(kDRegSize);
12810
12811 InsrHelper(&masm, zn, zn_inputs);
12812 InsrHelper(&masm, zm, zm_inputs);
12813
12814 (masm.*macro)(zd, zn, zm);
12815
12816 ZRegister zm_max_shift_amount = z25.WithLaneSize(kDRegSize);
12817 ZRegister zd_max_shift_amount = z24.WithLaneSize(lane_size_in_bits);
12818
12819 __ Dup(zm_max_shift_amount, lane_size_in_bits);
12820 (masm.*macro)(zd_max_shift_amount, zn, zm_max_shift_amount);
12821
12822 ZRegister zm_out_of_range = z23.WithLaneSize(kDRegSize);
12823 ZRegister zd_out_of_range = z22.WithLaneSize(lane_size_in_bits);
12824
12825 __ Dup(zm_out_of_range, GetUintMask(lane_size_in_bits));
12826 (masm.*macro)(zd_out_of_range, zn, zm_out_of_range);
12827
12828 END();
12829
12830 if (CAN_RUN()) {
12831 RUN();
12832
12833 ASSERT_EQUAL_SVE(zd_expected, zd);
12834 ASSERT_EQUAL_SVE(zd_expected_max_shift_amount, zd_max_shift_amount);
12835 ASSERT_EQUAL_SVE(zd_max_shift_amount, zd_out_of_range);
12836 }
12837}
12838
12839TEST_SVE(sve_bitwise_shift_wide_elements_unpredicated_asr) {
12840 // clang-format off
12841 uint64_t inputs_b[] = {0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80,
12842 0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80};
12843 int shift_b[] = {1, 3};
12844 uint64_t expected_b[] = {0xff, 0xee, 0xdd, 0xcc, 0xff, 0x2a, 0xd5, 0xc0,
12845 0xff, 0xfb, 0xf7, 0xf3, 0xff, 0x0a, 0xf5, 0xf0};
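  // For example, 0xdc is -36 as int8, so asr 1 gives -18 (0xee) and asr 3
  // gives -5 (0xfb), rounding towards minus infinity.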
12846 BitwiseShiftWideElementsHelper(config,
12847 ASR,
12848 kBRegSize,
12849 inputs_b,
12850 shift_b,
12851 expected_b);
12852
12853 uint64_t inputs_h[] = {0xfedc, 0xfa55, 0x0011, 0x2233,
12854 0xfedc, 0xfa55, 0x0011, 0x2233,
12855 0xfedc, 0xfa55, 0x0011, 0x2233};
12856 int shift_h[] = {1, 8, 11};
12857 uint64_t expected_h[] = {0xff6e, 0xfd2a, 0x0008, 0x1119,
12858 0xfffe, 0xfffa, 0x0000, 0x0022,
12859 0xffff, 0xffff, 0x0000, 0x0004};
12860 BitwiseShiftWideElementsHelper(config,
12861 ASR,
12862 kHRegSize,
12863 inputs_h,
12864 shift_h,
12865 expected_h);
12866
12867 uint64_t inputs_s[] =
12868 {0xfedcba98, 0xfffa55aa, 0x00112233, 0x01234567, 0xaaaaaaaa, 0x88888888};
12869 int shift_s[] = {1, 9, 23};
12870 uint64_t expected_s[] =
12871 {0xff6e5d4c, 0xfffd2ad5, 0x00000891, 0x000091a2, 0xffffff55, 0xffffff11};
12872 BitwiseShiftWideElementsHelper(config,
12873 ASR,
12874 kSRegSize,
12875 inputs_s,
12876 shift_s,
12877 expected_s);
12878 // clang-format on
12879}
12880
12881TEST_SVE(sve_bitwise_shift_wide_elements_unpredicated_lsr) {
12882 // clang-format off
12883 uint64_t inputs_b[] = {0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80,
12884 0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80};
12885 int shift_b[] = {1, 3};
12886 uint64_t expected_b[] = {0x7f, 0x6e, 0x5d, 0x4c, 0x7f, 0x2a, 0x55, 0x40,
12887 0x1f, 0x1b, 0x17, 0x13, 0x1f, 0x0a, 0x15, 0x10};
12888
12889 BitwiseShiftWideElementsHelper(config,
12890 LSR,
12891 kBRegSize,
12892 inputs_b,
12893 shift_b,
12894 expected_b);
12895
12896 uint64_t inputs_h[] = {0xfedc, 0xfa55, 0x0011, 0x2233,
12897 0xfedc, 0xfa55, 0x0011, 0x2233,
12898 0xfedc, 0xfa55, 0x0011, 0x2233};
12899 int shift_h[] = {1, 8, 11};
12900 uint64_t expected_h[] = {0x7f6e, 0x7d2a, 0x0008, 0x1119,
12901 0x00fe, 0x00fa, 0x0000, 0x0022,
12902 0x001f, 0x001f, 0x0000, 0x0004};
12903 BitwiseShiftWideElementsHelper(config,
12904 LSR,
12905 kHRegSize,
12906 inputs_h,
12907 shift_h,
12908 expected_h);
12909
12910 uint64_t inputs_s[] =
12911 {0xfedcba98, 0xfffa55aa, 0x00112233, 0x01234567, 0xaaaaaaaa, 0x88888888};
12912 int shift_s[] = {1, 9, 23};
12913 uint64_t expected_s[] =
12914 {0x7f6e5d4c, 0x7ffd2ad5, 0x00000891, 0x000091a2, 0x00000155, 0x00000111};
12915 BitwiseShiftWideElementsHelper(config,
12916 LSR,
12917 kSRegSize,
12918 inputs_s,
12919 shift_s,
12920 expected_s);
12921 // clang-format on
12922}
12923
12924TEST_SVE(sve_bitwise_shift_wide_elements_unpredicated_lsl) {
12925 // clang-format off
12926 uint64_t inputs_b[] = {0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80,
12927 0xfe, 0xdc, 0xba, 0x98, 0xff, 0x55, 0xaa, 0x80};
12928 int shift_b[] = {1, 5};
12929
12930 uint64_t expected_b[] = {0xfc, 0xb8, 0x74, 0x30, 0xfe, 0xaa, 0x54, 0x00,
12931 0xc0, 0x80, 0x40, 0x00, 0xe0, 0xa0, 0x40, 0x00};
12932
12933 BitwiseShiftWideElementsHelper(config,
12934 LSL,
12935 kBRegSize,
12936 inputs_b,
12937 shift_b,
12938 expected_b);
12939 uint64_t inputs_h[] = {0xfedc, 0xfa55, 0x0011, 0x2233,
12940 0xfedc, 0xfa55, 0x0011, 0x2233,
12941 0xfedc, 0xfa55, 0x0011, 0x2233};
12942 int shift_h[] = {1, 2, 14};
12943
12944 uint64_t expected_h[] = {0xfdb8, 0xf4aa, 0x0022, 0x4466,
12945 0xfb70, 0xe954, 0x0044, 0x88cc,
12946 0x0000, 0x4000, 0x4000, 0xc000};
12947 BitwiseShiftWideElementsHelper(config,
12948 LSL,
12949 kHRegSize,
12950 inputs_h,
12951 shift_h,
12952 expected_h);
12953 uint64_t inputs_s[] =
12954 {0xfedcba98, 0xfffa55aa, 0x00112233, 0x01234567, 0xaaaaaaaa, 0x88888888};
12955 int shift_s[] = {1, 19, 26};
12956 uint64_t expected_s[] =
12957 {0xfdb97530, 0xfff4ab54, 0x11980000, 0x2b380000, 0xa8000000, 0x20000000};
12958 BitwiseShiftWideElementsHelper(config,
12959 LSL,
12960 kSRegSize,
12961 inputs_s,
12962 shift_s,
12963 expected_s);
12964
12965 // Test large shifts outside the range of the "unsigned" type.
12966 uint64_t inputs_b2[] = {1, 2, 4, 8, 3, 5, 7, 9,
12967 1, 2, 4, 8, 3, 5, 7, 9};
12968 uint64_t shift_b2[] = {1, 0x1000000001};
12969 uint64_t expected_b2[] = {2, 4, 8, 16, 6, 10, 14, 18,
12970 0, 0, 0, 0, 0, 0, 0, 0};
12971 BitwiseShiftWideElementsHelper(config, LSL, kBRegSize, inputs_b2, shift_b2,
12972 expected_b2);
12973
12974 // clang-format on
12975}
12976
12977TEST_SVE(sve_shift_by_vector) {
12978 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
12979
12980 START();
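  // Zipping an all-true predicate with an all-false one produces predicates
  // with every other lane active at each lane size; the merging shifts below
  // use these so that alternate lanes keep their 0x55 fill values.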
12981 __ Ptrue(p0.VnB());
12982 __ Pfalse(p1.VnB());
12983 __ Zip1(p2.VnB(), p0.VnB(), p1.VnB());
12984 __ Zip1(p3.VnH(), p0.VnH(), p1.VnH());
12985 __ Zip1(p4.VnS(), p0.VnS(), p1.VnS());
12986 __ Zip1(p5.VnD(), p0.VnD(), p1.VnD());
12987
12988 __ Dup(z31.VnD(), 0x8000000080008080);
12989 __ Dup(z0.VnB(), -1);
12990
12991 __ Index(z1.VnB(), 0, 1);
12992 __ Dup(z2.VnB(), 0x55);
12993 __ Lsr(z2.VnB(), p2.Merging(), z0.VnB(), z1.VnB());
12994 __ Lsl(z3.VnB(), p0.Merging(), z0.VnB(), z1.VnB());
12995 __ Asr(z4.VnB(), p0.Merging(), z31.VnB(), z1.VnB());
12996
12997 __ Index(z1.VnH(), 0, 1);
12998 __ Dup(z6.VnB(), 0x55);
12999 __ Lsr(z5.VnH(), p0.Merging(), z0.VnH(), z1.VnH());
13000 __ Lsl(z6.VnH(), p3.Merging(), z0.VnH(), z1.VnH());
13001 __ Asr(z7.VnH(), p0.Merging(), z31.VnH(), z1.VnH());
13002
13003 __ Index(z1.VnS(), 0, 1);
13004 __ Dup(z10.VnB(), 0x55);
13005 __ Lsr(z8.VnS(), p0.Merging(), z0.VnS(), z1.VnS());
13006 __ Lsl(z9.VnS(), p0.Merging(), z0.VnS(), z1.VnS());
13007 __ Asr(z10.VnS(), p4.Merging(), z31.VnS(), z1.VnS());
13008
13009 __ Index(z1.VnD(), 0, 1);
13010 __ Lsr(z0.VnD(), p5.Merging(), z0.VnD(), z1.VnD());
13011 __ Lsl(z12.VnD(), p0.Merging(), z0.VnD(), z1.VnD());
13012 __ Asr(z13.VnD(), p0.Merging(), z31.VnD(), z1.VnD());
13013
13014 __ Dup(z11.VnD(), 0x100000001);
13015 __ Lsl(z14.VnD(), p0.Merging(), z1.VnD(), z11.VnD());
13016
13017 __ Index(z0.VnH(), 7, -1);
13018 __ Lsr(z0.VnH(), p0.Merging(), z31.VnH(), z0.VnH());
13019 END();
13020
13021 if (CAN_RUN()) {
13022 RUN();
13023
13024 uint64_t expected_z0[] = {0x8000000020001010, 0x0800000002000101};
13025 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
13026 uint64_t expected_z2[] = {0x5500550055005500, 0x5503550f553f55ff};
13027 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
13028 uint64_t expected_z3[] = {0x0000000000000000, 0x80c0e0f0f8fcfeff};
13029 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
13030 uint64_t expected_z4[] = {0xff000000ff00ffff, 0xff000000f000c080};
13031 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
13032 uint64_t expected_z5[] = {0x01ff03ff07ff0fff, 0x1fff3fff7fffffff};
13033 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
13034 uint64_t expected_z6[] = {0x5555ffc05555fff0, 0x5555fffc5555ffff};
13035 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
13036 uint64_t expected_z7[] = {0xff000000fc00f808, 0xf0000000c0008080};
13037 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
13038 uint64_t expected_z8[] = {0x1fffffff3fffffff, 0x7fffffffffffffff};
13039 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
13040 uint64_t expected_z9[] = {0xfffffff8fffffffc, 0xfffffffeffffffff};
13041 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
13042 uint64_t expected_z10[] = {0x55555555e0002020, 0x5555555580008080};
13043 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
13044 uint64_t expected_z12[] = {0xfffffffffffffffe, 0xffffffffffffffff};
13045 ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
13046 uint64_t expected_z13[] = {0xc000000040004040, 0x8000000080008080};
13047 ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
13048 uint64_t expected_z14[] = {0, 0};
13049 ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
13050 }
13051}
13052
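// In the wide-element forms below, the shift amounts are taken from the D-sized
// lanes of z1 (Index gives 1, 6, 11, ...), so every narrower lane within a given
// 64-bit block is shifted by the same amount.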
13053TEST_SVE(sve_shift_by_wide_vector) {
13054 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13055
13056 START();
13057 __ Ptrue(p0.VnB());
13058 __ Pfalse(p1.VnB());
13059 __ Zip1(p2.VnB(), p0.VnB(), p1.VnB());
13060 __ Zip1(p3.VnH(), p0.VnH(), p1.VnH());
13061 __ Zip1(p4.VnS(), p0.VnS(), p1.VnS());
13062
13063 __ Dup(z31.VnD(), 0x8000000080008080);
13064 __ Dup(z0.VnB(), -1);
13065 __ Index(z1.VnD(), 1, 5);
13066
13067 __ Dup(z2.VnB(), 0x55);
13068 __ Lsr(z2.VnB(), p2.Merging(), z2.VnB(), z1.VnD());
13069 __ Lsl(z3.VnB(), p0.Merging(), z0.VnB(), z1.VnD());
13070 __ Asr(z4.VnB(), p0.Merging(), z31.VnB(), z1.VnD());
13071
13072 __ Dup(z6.VnB(), 0x55);
13073 __ Lsr(z5.VnH(), p0.Merging(), z0.VnH(), z1.VnD());
13074 __ Lsl(z6.VnH(), p3.Merging(), z6.VnH(), z1.VnD());
13075 __ Asr(z7.VnH(), p0.Merging(), z31.VnH(), z1.VnD());
13076
13077 __ Dup(z10.VnB(), 0x55);
13078 __ Lsr(z8.VnS(), p0.Merging(), z0.VnS(), z1.VnD());
13079 __ Lsl(z9.VnS(), p0.Merging(), z0.VnS(), z1.VnD());
13080 __ Asr(z10.VnS(), p4.Merging(), z31.VnS(), z1.VnD());
13081 END();
13082
13083 if (CAN_RUN()) {
13084 RUN();
13085
13086 uint64_t expected_z2[] = {0x5501550155015501, 0x552a552a552a552a};
13087 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
13088 uint64_t expected_z3[] = {0xc0c0c0c0c0c0c0c0, 0xfefefefefefefefe};
13089 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
13090 uint64_t expected_z4[] = {0xfe000000fe00fefe, 0xc0000000c000c0c0};
13091 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
13092 uint64_t expected_z5[] = {0x03ff03ff03ff03ff, 0x7fff7fff7fff7fff};
13093 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
13094 uint64_t expected_z6[] = {0x5555554055555540, 0x5555aaaa5555aaaa};
13095 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
13096 uint64_t expected_z7[] = {0xfe000000fe00fe02, 0xc0000000c000c040};
13097 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
13098 uint64_t expected_z8[] = {0x03ffffff03ffffff, 0x7fffffff7fffffff};
13099 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
13100 uint64_t expected_z9[] = {0xffffffc0ffffffc0, 0xfffffffefffffffe};
13101 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
13102 uint64_t expected_z10[] = {0x55555555fe000202, 0x55555555c0004040};
13103 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
13104 }
13105}
13106
13107TEST_SVE(sve_pred_shift_imm) {
13108 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13109
13110 START();
13111 __ Ptrue(p0.VnB());
13112 __ Pfalse(p1.VnB());
13113 __ Zip1(p2.VnB(), p0.VnB(), p1.VnB());
13114 __ Zip1(p3.VnH(), p0.VnH(), p1.VnH());
13115 __ Zip1(p4.VnS(), p0.VnS(), p1.VnS());
13116 __ Zip1(p5.VnD(), p0.VnD(), p1.VnD());
13117
13118 __ Dup(z31.VnD(), 0x8000000080008080);
13119 __ Lsr(z0.VnB(), p0.Merging(), z31.VnB(), 1);
13120 __ Mov(z1, z0);
13121 __ Lsl(z1.VnB(), p2.Merging(), z1.VnB(), 1);
13122 __ Asr(z2.VnB(), p0.Merging(), z1.VnB(), 2);
13123
13124 __ Lsr(z3.VnH(), p0.Merging(), z31.VnH(), 2);
13125 __ Mov(z4, z3);
13126 __ Lsl(z4.VnH(), p3.Merging(), z4.VnH(), 2);
13127 __ Asr(z5.VnH(), p0.Merging(), z4.VnH(), 3);
13128
13129 __ Lsr(z6.VnS(), p0.Merging(), z31.VnS(), 3);
13130 __ Mov(z7, z6);
13131 __ Lsl(z7.VnS(), p4.Merging(), z7.VnS(), 3);
13132 __ Asr(z8.VnS(), p0.Merging(), z7.VnS(), 4);
13133
13134 __ Lsr(z9.VnD(), p0.Merging(), z31.VnD(), 4);
13135 __ Mov(z10, z9);
13136 __ Lsl(z10.VnD(), p5.Merging(), z10.VnD(), 4);
13137 __ Asr(z11.VnD(), p0.Merging(), z10.VnD(), 5);
13138 END();
13139
13140 if (CAN_RUN()) {
13141 RUN();
13142 uint64_t expected_z0[] = {0x4000000040004040, 0x4000000040004040};
13143 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
13144 uint64_t expected_z1[] = {0x4000000040004080, 0x4000000040004080};
13145 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
13146 uint64_t expected_z2[] = {0x10000000100010e0, 0x10000000100010e0};
13147 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
13148 uint64_t expected_z3[] = {0x2000000020002020, 0x2000000020002020};
13149 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
13150 uint64_t expected_z4[] = {0x2000000020008080, 0x2000000020008080};
13151 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
13152 uint64_t expected_z5[] = {0x040000000400f010, 0x040000000400f010};
13153 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
13154 uint64_t expected_z6[] = {0x1000000010001010, 0x1000000010001010};
13155 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
13156 uint64_t expected_z7[] = {0x1000000080008080, 0x1000000080008080};
13157 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
13158 uint64_t expected_z8[] = {0x01000000f8000808, 0x01000000f8000808};
13159 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
13160 uint64_t expected_z9[] = {0x0800000008000808, 0x0800000008000808};
13161 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
13162 uint64_t expected_z10[] = {0x0800000008000808, 0x8000000080008080};
13163 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
13164 uint64_t expected_z11[] = {0x0040000000400040, 0xfc00000004000404};
13165 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
13166 }
13167}
13168
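// Asrd is an arithmetic shift right intended for signed division by a power of
// two: it rounds towards zero rather than towards minus infinity. For example,
// shifting -127 (0x81 in a B lane) right by one gives -63 (0xc1), where a plain
// Asr would give -64 (0xc0).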
13169TEST_SVE(sve_asrd) {
13170 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13171
13172 START();
13173 __ Ptrue(p0.VnB());
13174 __ Pfalse(p1.VnB());
13175 __ Zip1(p2.VnB(), p0.VnB(), p1.VnB());
13176 __ Zip1(p3.VnH(), p0.VnH(), p1.VnH());
13177 __ Zip1(p4.VnS(), p0.VnS(), p1.VnS());
13178 __ Zip1(p5.VnD(), p0.VnD(), p1.VnD());
13179
13180 __ Index(z31.VnB(), 0x7f - 3, 1);
13181 __ Asrd(z0.VnB(), p0.Merging(), z31.VnB(), 1);
13182 __ Mov(z1, z31);
13183 __ Asrd(z1.VnB(), p2.Merging(), z1.VnB(), 2);
13184 __ Asrd(z2.VnB(), p0.Merging(), z31.VnB(), 7);
13185 __ Asrd(z3.VnB(), p0.Merging(), z31.VnB(), 8);
13186
13187 __ Index(z31.VnH(), 0x7fff - 3, 1);
13188 __ Asrd(z4.VnH(), p0.Merging(), z31.VnH(), 1);
13189 __ Mov(z5, z31);
13190 __ Asrd(z5.VnH(), p3.Merging(), z5.VnH(), 2);
13191 __ Asrd(z6.VnH(), p0.Merging(), z31.VnH(), 15);
13192 __ Asrd(z7.VnH(), p0.Merging(), z31.VnH(), 16);
13193
13194 __ Index(z31.VnS(), 0x7fffffff - 1, 1);
13195 __ Asrd(z8.VnS(), p0.Merging(), z31.VnS(), 1);
13196 __ Mov(z9, z31);
13197 __ Asrd(z9.VnS(), p4.Merging(), z9.VnS(), 2);
13198 __ Asrd(z10.VnS(), p0.Merging(), z31.VnS(), 31);
13199 __ Asrd(z11.VnS(), p0.Merging(), z31.VnS(), 32);
13200
13201 __ Index(z31.VnD(), 0x7fffffffffffffff, 1);
13202 __ Asrd(z12.VnD(), p0.Merging(), z31.VnD(), 1);
13203 __ Mov(z13, z31);
13204 __ Asrd(z13.VnD(), p5.Merging(), z13.VnD(), 2);
13205 __ Asrd(z14.VnD(), p0.Merging(), z31.VnD(), 63);
13206 __ Asrd(z31.VnD(), p0.Merging(), z31.VnD(), 64);
13207 END();
13208
13209 if (CAN_RUN()) {
13210 RUN();
13211 uint64_t expected_z0[] = {0xc6c5c5c4c4c3c3c2, 0xc2c1c1c03f3f3e3e};
13212 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
13213 uint64_t expected_z1[] = {0x8be389e287e285e1, 0x83e181e07f1f7d1f};
13214 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
13215 uint64_t expected_z2[] = {0x0000000000000000, 0x000000ff00000000};
13216 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
13217 uint64_t expected_z3[] = {0x0000000000000000, 0x0000000000000000};
13218 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
13219 uint64_t expected_z4[] = {0xc002c001c001c000, 0x3fff3fff3ffe3ffe};
13220 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
13221 uint64_t expected_z5[] = {0x8003e0018001e000, 0x7fff1fff7ffd1fff};
13222 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
13223 uint64_t expected_z6[] = {0x000000000000ffff, 0x0000000000000000};
13224 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
13225 uint64_t expected_z7[] = {0x0000000000000000, 0x0000000000000000};
13226 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
13227 uint64_t expected_z8[] = {0xc0000001c0000000, 0x3fffffff3fffffff};
13228 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
13229 uint64_t expected_z9[] = {0x80000001e0000000, 0x7fffffff1fffffff};
13230 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
13231 uint64_t expected_z10[] = {0x00000000ffffffff, 0x0000000000000000};
13232 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
13233 uint64_t expected_z11[] = {0x0000000000000000, 0x0000000000000000};
13234 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
13235 uint64_t expected_z12[] = {0xc000000000000000, 0x3fffffffffffffff};
13236 ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
13237 uint64_t expected_z13[] = {0x8000000000000000, 0x1fffffffffffffff};
13238 ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
13239 uint64_t expected_z14[] = {0xffffffffffffffff, 0x0000000000000000};
13240 ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
13241 uint64_t expected_z31[] = {0x0000000000000000, 0x0000000000000000};
13242 ASSERT_EQUAL_SVE(expected_z31, z31.VnD());
13243 }
13244}
13245
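// Setffr sets every bit of the first-fault register (FFR), so reading it back
// with Rdffr is expected to produce an all-true predicate, matching Ptrue.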
13246TEST_SVE(sve_setffr) {
13247 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13248 START();
13249
13250 __ Ptrue(p15.VnB());
13251 __ Setffr();
13252 __ Rdffr(p14.VnB());
13253
13254 END();
13255
13256 if (CAN_RUN()) {
13257 RUN();
13258
13259 ASSERT_EQUAL_SVE(p14.VnB(), p15.VnB());
13260 }
13261}
13262
13263static void WrffrHelper(Test* config, unsigned active_lanes) {
13264 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13265 START();
13266
13267 int inputs[kPRegMaxSize] = {0};
13268 VIXL_ASSERT(active_lanes <= kPRegMaxSize);
13269 for (unsigned i = 0; i < active_lanes; i++) {
13270 // The rightmost (highest-indexed) array element maps to the lowest-numbered
13271 // lane.
13272 inputs[kPRegMaxSize - i - 1] = 1;
13273 }
13274
13275 Initialise(&masm, p1.VnB(), inputs);
13276 __ Wrffr(p1.VnB());
13277 __ Rdffr(p2.VnB());
13278
13279 END();
13280
13281 if (CAN_RUN()) {
13282 RUN();
13283
13284 ASSERT_EQUAL_SVE(p1.VnB(), p2.VnB());
13285 }
13286}
13287
13288TEST_SVE(sve_wrffr) {
13289 int active_lanes_inputs[] = {0, 1, 7, 10, 32, 48, kPRegMaxSize};
13290 for (size_t i = 0; i < ArrayLength(active_lanes_inputs); i++) {
13291 WrffrHelper(config, active_lanes_inputs[i]);
13292 }
13293}
13294
13295template <size_t N>
13296static void RdffrHelper(Test* config,
13297 size_t active_lanes,
13298 const int (&pg_inputs)[N]) {
13299 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13300 START();
13301
13302 VIXL_ASSERT(active_lanes <= kPRegMaxSize);
13303
13304 // The rightmost (highest-indexed) array element maps to the lowest-numbered
13305 // lane.
13306 int pd[kPRegMaxSize] = {0};
13307 for (unsigned i = 0; i < active_lanes; i++) {
13308 pd[kPRegMaxSize - i - 1] = 1;
13309 }
13310
13311 int pg[kPRegMaxSize] = {0};
13312 for (unsigned i = 0; i < N; i++) {
13313 pg[kPRegMaxSize - i - 1] = pg_inputs[i];
13314 }
13315
13316 int pd_expected[kPRegMaxSize] = {0};
13317 for (unsigned i = 0; i < std::min(active_lanes, N); i++) {
13318 int lane = kPRegMaxSize - i - 1;
13319 pd_expected[lane] = pd[lane] & pg[lane];
13320 }
13321
13322 Initialise(&masm, p0.VnB(), pg);
13323 Initialise(&masm, p1.VnB(), pd);
13324
13325 // The unpredicated form of rdffr has been tested in `WrffrHelper`.
13326 __ Wrffr(p1.VnB());
13327 __ Rdffr(p14.VnB(), p0.Zeroing());
13328 __ Rdffrs(p13.VnB(), p0.Zeroing());
13329 __ Mrs(x8, NZCV);
13330
13331 END();
13332
13333 if (CAN_RUN()) {
13334 RUN();
13335
13336 ASSERT_EQUAL_SVE(pd_expected, p14.VnB());
13337 ASSERT_EQUAL_SVE(pd_expected, p13.VnB());
13338 StatusFlags nzcv_expected =
13339 GetPredTestFlags(pd_expected, pg, core.GetSVELaneCount(kBRegSize));
13340 ASSERT_EQUAL_64(nzcv_expected, x8);
13341 }
13342}
13343
13344TEST_SVE(sve_rdffr_rdffrs) {
13345 // clang-format off
13346 int active_lanes_inputs[] = {0, 1, 15, 26, 39, 47, kPRegMaxSize};
13347 int pg_inputs_0[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
13348 int pg_inputs_1[] = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
13349 int pg_inputs_2[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
13350 int pg_inputs_3[] = {0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1};
13351 int pg_inputs_4[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
13352 // clang-format on
13353
13354 for (size_t i = 0; i < ArrayLength(active_lanes_inputs); i++) {
13355 RdffrHelper(config, active_lanes_inputs[i], pg_inputs_0);
13356 RdffrHelper(config, active_lanes_inputs[i], pg_inputs_1);
13357 RdffrHelper(config, active_lanes_inputs[i], pg_inputs_2);
13358 RdffrHelper(config, active_lanes_inputs[i], pg_inputs_3);
13359 RdffrHelper(config, active_lanes_inputs[i], pg_inputs_4);
13360 }
13361}
13362
13363typedef void (MacroAssembler::*BrkpFn)(const PRegisterWithLaneSize& pd,
13364 const PRegisterZ& pg,
13365 const PRegisterWithLaneSize& pn,
13366 const PRegisterWithLaneSize& pm);
13367
13368template <typename Tg, typename Tn, typename Td>
13369static void BrkpaBrkpbHelper(Test* config,
13370 BrkpFn macro,
13371 BrkpFn macro_set_flags,
13372 const Tg& pg_inputs,
13373 const Tn& pn_inputs,
13374 const Tn& pm_inputs,
13375 const Td& pd_expected) {
13376 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13377 START();
13378
13379 PRegister pg = p15;
13380 PRegister pn = p14;
13381 PRegister pm = p13;
13382 Initialise(&masm, pg.VnB(), pg_inputs);
13383 Initialise(&masm, pn.VnB(), pn_inputs);
13384 Initialise(&masm, pm.VnB(), pm_inputs);
13385
13386 // Initialise NZCV to an impossible value, to check that we actually write it.
13387 __ Mov(x10, NZCVFlag);
13388 __ Msr(NZCV, x10);
13389
13390 (masm.*macro_set_flags)(p0.VnB(), pg.Zeroing(), pn.VnB(), pm.VnB());
13391 __ Mrs(x0, NZCV);
13392
13393 (masm.*macro)(p1.VnB(), pg.Zeroing(), pn.VnB(), pm.VnB());
13394
13395 END();
13396
13397 if (CAN_RUN()) {
13398 RUN();
13399
13400 ASSERT_EQUAL_SVE(pd_expected, p0.VnB());
13401
13402 // Check that the flags were properly set.
13403 StatusFlags nzcv_expected =
13404 GetPredTestFlags(pd_expected,
13405 pg_inputs,
13406 core.GetSVELaneCount(kBRegSize));
13407 ASSERT_EQUAL_64(nzcv_expected, x0);
13408 ASSERT_EQUAL_SVE(p0.VnB(), p1.VnB());
13409 }
13410}
13411
13412template <typename Tg, typename Tn, typename Td>
13413static void BrkpaHelper(Test* config,
13414 const Tg& pg_inputs,
13415 const Tn& pn_inputs,
13416 const Tn& pm_inputs,
13417 const Td& pd_expected) {
13418 BrkpaBrkpbHelper(config,
13419 &MacroAssembler::Brkpa,
13420 &MacroAssembler::Brkpas,
13421 pg_inputs,
13422 pn_inputs,
13423 pm_inputs,
13424 pd_expected);
13425}
13426
13427template <typename Tg, typename Tn, typename Td>
13428static void BrkpbHelper(Test* config,
13429 const Tg& pg_inputs,
13430 const Tn& pn_inputs,
13431 const Tn& pm_inputs,
13432 const Td& pd_expected) {
13433 BrkpaBrkpbHelper(config,
13434 &MacroAssembler::Brkpb,
13435 &MacroAssembler::Brkpbs,
13436 pg_inputs,
13437 pn_inputs,
13438 pm_inputs,
13439 pd_expected);
13440}
13441
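// Brkpa and Brkpb propagate a "break" from the first active `pm` lane that is
// true: active lanes below that point are set (Brkpa also sets the breaking
// lane itself, Brkpb does not), and the result collapses to all-false when the
// last active lane of `pn` is false, as in the `pg_4` cases at the end of each
// of these tests.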
13442TEST_SVE(sve_brkpb) {
13443 // clang-format off
13444 // The last active element of `pn` is `true` in all vector length configurations.
13445 // | boundary of 128-bit VL.
13446 // v
13447 int pg_1[] = {1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
13448 int pg_2[] = {1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
13449 int pg_3[] = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
13450
13451 // | highest-numbered lane lowest-numbered lane |
13452 // v v
13453 int pn_1[] = {1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
13454 int pn_2[] = {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
13455 int pn_3[] = {1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1};
13456
13457 int pm_1[] = {1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
13458 int pm_2[] = {0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
13459 int pm_3[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
13460
13461 // | first active
13462 // v
13463 int exp_1_1_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
13464 // | first active
13465 // v
13466 int exp_1_2_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
13467 // | first active
13468 // v
13469 int exp_1_3_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
13470
13471 BrkpbHelper(config, pg_1, pn_1, pm_1, exp_1_1_1);
13472 BrkpbHelper(config, pg_1, pn_2, pm_2, exp_1_2_2);
13473 BrkpbHelper(config, pg_1, pn_3, pm_3, exp_1_3_3);
13474
13475 // | first active
13476 // v
13477 int exp_2_1_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
13478 // | first active
13479 // v
13480 int exp_2_2_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
13481 // | first active
13482 // v
13483 int exp_2_3_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1};
13484 BrkpbHelper(config, pg_2, pn_1, pm_2, exp_2_1_2);
13485 BrkpbHelper(config, pg_2, pn_2, pm_3, exp_2_2_3);
13486 BrkpbHelper(config, pg_2, pn_3, pm_1, exp_2_3_1);
13487
13488 // | first active
13489 // v
13490 int exp_3_1_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
13491 // | first active
13492 // v
13493 int exp_3_2_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
13494 // | first active
13495 // v
13496 int exp_3_3_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
13497 BrkpbHelper(config, pg_3, pn_1, pm_3, exp_3_1_3);
13498 BrkpbHelper(config, pg_3, pn_2, pm_1, exp_3_2_1);
13499 BrkpbHelper(config, pg_3, pn_3, pm_2, exp_3_3_2);
13500
13501 // The last active element of `pn` is `false` in all vector length configurations.
13502 // | last active lane when VL > 128 bits.
13503 // v
13504 // | last active lane when VL == 128 bits.
13505 // v
13506 int pg_4[] = {0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
13507 int exp_4_x_x[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
13508 BrkpbHelper(config, pg_4, pn_1, pm_1, exp_4_x_x);
13509 BrkpbHelper(config, pg_4, pn_2, pm_2, exp_4_x_x);
13510 BrkpbHelper(config, pg_4, pn_3, pm_3, exp_4_x_x);
13511 // clang-format on
13512}
13513
13514TEST_SVE(sve_brkpa) {
13515 // clang-format off
13516 // The last active element of `pn` is `true` in all vector length configurations.
13517 // | boundary of 128-bit VL.
13518 // v
13519 int pg_1[] = {1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
13520 int pg_2[] = {1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
13521 int pg_3[] = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
13522
13523 // | highest-numbered lane lowest-numbered lane |
13524 // v v
13525 int pn_1[] = {1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
13526 int pn_2[] = {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
13527 int pn_3[] = {1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1};
13528
13529 int pm_1[] = {1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
13530 int pm_2[] = {0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
13531 int pm_3[] = {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
13532
13533 // | first active
13534 // v
13535 int exp_1_1_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0};
13536 // | first active
13537 // v
13538 int exp_1_2_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
13539 // | first active
13540 // v
13541 int exp_1_3_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0};
13542
13543 BrkpaHelper(config, pg_1, pn_1, pm_1, exp_1_1_1);
13544 BrkpaHelper(config, pg_1, pn_2, pm_2, exp_1_2_2);
13545 BrkpaHelper(config, pg_1, pn_3, pm_3, exp_1_3_3);
13546
13547 // | first active
13548 // v
13549 int exp_2_1_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
13550 // | first active
13551 // v
13552 int exp_2_2_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
13553 // | first active
13554 // v
13555 int exp_2_3_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1};
13556 BrkpaHelper(config, pg_2, pn_1, pm_2, exp_2_1_2);
13557 BrkpaHelper(config, pg_2, pn_2, pm_3, exp_2_2_3);
13558 BrkpaHelper(config, pg_2, pn_3, pm_1, exp_2_3_1);
13559
13560 // | first active
13561 // v
13562 int exp_3_1_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1};
13563 // | first active
13564 // v
13565 int exp_3_2_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1};
13566 // | first active
13567 // v
13568 int exp_3_3_2[] = {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
13569 BrkpaHelper(config, pg_3, pn_1, pm_3, exp_3_1_3);
13570 BrkpaHelper(config, pg_3, pn_2, pm_1, exp_3_2_1);
13571 BrkpaHelper(config, pg_3, pn_3, pm_2, exp_3_3_2);
13572
13573 // The last active element of `pn` is `false` in all vector length configurations.
13574 // | last active lane when VL > 128 bits.
13575 // v
13576 // | last active lane when VL == 128 bits.
13577 // v
13578 int pg_4[] = {0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
13579 int exp_4_x_x[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
13580 BrkpaHelper(config, pg_4, pn_1, pm_1, exp_4_x_x);
13581 BrkpaHelper(config, pg_4, pn_2, pm_2, exp_4_x_x);
13582 BrkpaHelper(config, pg_4, pn_3, pm_3, exp_4_x_x);
13583 // clang-format on
13584}
13585
13586TEST_SVE(sve_rbit) {
13587 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13588 START();
13589
13590 uint64_t inputs[] = {0xaaaaaaaa55555555, 0xaaaa5555aa55aa55};
13591 InsrHelper(&masm, z0.VnD(), inputs);
13592
13593 __ Ptrue(p1.VnB());
13594 int pred[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1};
13595 Initialise(&masm, p2.VnB(), pred);
13596
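  // Rbit reverses the bit order within each lane, so applying it twice should
  // restore the original value; the first assertion below relies on this.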
13597 __ Rbit(z0.VnB(), p1.Merging(), z0.VnB());
13598 __ Rbit(z0.VnB(), p1.Merging(), z0.VnB());
13599
13600 __ Rbit(z1.VnB(), p1.Merging(), z0.VnB());
13601 __ Rbit(z2.VnH(), p1.Merging(), z0.VnH());
13602 __ Rbit(z3.VnS(), p1.Merging(), z0.VnS());
13603 __ Rbit(z4.VnD(), p1.Merging(), z0.VnD());
13604
13605 __ Dup(z5.VnB(), 0x42);
13606 __ Rbit(z5.VnB(), p2.Merging(), z0.VnB());
13607 __ Dup(z6.VnB(), 0x42);
13608 __ Rbit(z6.VnS(), p2.Merging(), z0.VnS());
13609
13610 END();
13611
13612 if (CAN_RUN()) {
13613 RUN();
13614
13615 ASSERT_EQUAL_SVE(inputs, z0.VnD());
13616
13617 uint64_t expected_z1[] = {0x55555555aaaaaaaa, 0x5555aaaa55aa55aa};
13618 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
13619 uint64_t expected_z2[] = {0x55555555aaaaaaaa, 0x5555aaaaaa55aa55};
13620 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
13621 uint64_t expected_z3[] = {0x55555555aaaaaaaa, 0xaaaa5555aa55aa55};
13622 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
13623 uint64_t expected_z4[] = {0xaaaaaaaa55555555, 0xaa55aa55aaaa5555};
13624 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
13625 uint64_t expected_z5[] = {0x4255425542aa42aa, 0x4255424242aa42aa};
13626 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
13627 uint64_t expected_z6[] = {0x55555555aaaaaaaa, 0x42424242aa55aa55};
13628 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
13629 }
13630}
13631
13632TEST_SVE(sve_rev_bhw) {
13633 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13634 START();
13635
13636 uint64_t inputs[] = {0xaaaaaaaa55555555, 0xaaaa5555aa55aa55};
13637 InsrHelper(&masm, z0.VnD(), inputs);
13638
13639 __ Ptrue(p1.VnB());
13640 int pred[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1};
13641 Initialise(&masm, p2.VnB(), pred);
13642
13643 __ Revb(z1.VnH(), p1.Merging(), z0.VnH());
13644 __ Revb(z2.VnS(), p1.Merging(), z0.VnS());
13645 __ Revb(z3.VnD(), p1.Merging(), z0.VnD());
13646 __ Revh(z4.VnS(), p1.Merging(), z0.VnS());
13647 __ Revh(z5.VnD(), p1.Merging(), z0.VnD());
13648 __ Revw(z6.VnD(), p1.Merging(), z0.VnD());
13649
13650 __ Dup(z7.VnB(), 0x42);
13651 __ Revb(z7.VnH(), p2.Merging(), z0.VnH());
13652 __ Dup(z8.VnB(), 0x42);
13653 __ Revh(z8.VnS(), p2.Merging(), z0.VnS());
13654
13655 END();
13656
13657 if (CAN_RUN()) {
13658 RUN();
13659
13660 uint64_t expected_z1[] = {0xaaaaaaaa55555555, 0xaaaa555555aa55aa};
13661 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
13662 uint64_t expected_z2[] = {0xaaaaaaaa55555555, 0x5555aaaa55aa55aa};
13663 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
13664 uint64_t expected_z3[] = {0x55555555aaaaaaaa, 0x55aa55aa5555aaaa};
13665 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
13666 uint64_t expected_z4[] = {0xaaaaaaaa55555555, 0x5555aaaaaa55aa55};
13667 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
13668 uint64_t expected_z5[] = {0x55555555aaaaaaaa, 0xaa55aa555555aaaa};
13669 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
13670 uint64_t expected_z6[] = {0x55555555aaaaaaaa, 0xaa55aa55aaaa5555};
13671 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
13672 uint64_t expected_z7[] = {0xaaaaaaaa55555555, 0xaaaa424255aa55aa};
13673 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
13674 uint64_t expected_z8[] = {0xaaaaaaaa55555555, 0x42424242aa55aa55};
13675 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
13676 }
13677}
13678
13679TEST_SVE(sve_ftssel) {
13680 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13681 START();
13682
13683 uint64_t in[] = {0x1111777766665555, 0xaaaabbbbccccdddd};
13684 uint64_t q[] = {0x0001000300000002, 0x0001000200000003};
13685 InsrHelper(&masm, z0.VnD(), in);
13686 InsrHelper(&masm, z1.VnD(), q);
13687
13688 __ Ftssel(z2.VnH(), z0.VnH(), z1.VnH());
13689 __ Ftssel(z3.VnS(), z0.VnS(), z1.VnS());
13690 __ Ftssel(z4.VnD(), z0.VnD(), z1.VnD());
13691
13692 END();
13693
13694 if (CAN_RUN()) {
13695 RUN();
13696
13697 uint64_t expected_z2[] = {0x3c00bc006666d555, 0x3c003bbbccccbc00};
13698 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
13699 uint64_t expected_z3[] = {0xbf800000e6665555, 0x2aaabbbbbf800000};
13700 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
13701 uint64_t expected_z4[] = {0x9111777766665555, 0xbff0000000000000};
13702 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
13703 }
13704}
13705
13706TEST_SVE(sve_fexpa) {
13707 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13708 START();
13709
13710 uint64_t in0[] = {0x3ff0000000000000, 0x3ff0000000011001};
13711 uint64_t in1[] = {0x3ff000000002200f, 0xbff000000003301f};
13712 uint64_t in2[] = {0xbff000000004403f, 0x3ff0000000055040};
13713 uint64_t in3[] = {0x3f800000bf800001, 0x3f80000f3f80001f};
13714 uint64_t in4[] = {0x3f80002f3f82203f, 0xbf8000403f833041};
13715 uint64_t in5[] = {0x3c003c01bc00bc07, 0x3c08bc0f3c1fbc20};
13716 InsrHelper(&masm, z0.VnD(), in0);
13717 InsrHelper(&masm, z1.VnD(), in1);
13718 InsrHelper(&masm, z2.VnD(), in2);
13719 InsrHelper(&masm, z3.VnD(), in3);
13720 InsrHelper(&masm, z4.VnD(), in4);
13721 InsrHelper(&masm, z5.VnD(), in5);
13722
13723 __ Fexpa(z6.VnD(), z0.VnD());
13724 __ Fexpa(z7.VnD(), z1.VnD());
13725 __ Fexpa(z8.VnD(), z2.VnD());
13726 __ Fexpa(z9.VnS(), z3.VnS());
13727 __ Fexpa(z10.VnS(), z4.VnS());
13728 __ Fexpa(z11.VnH(), z5.VnH());
13729
13730 END();
13731
13732 if (CAN_RUN()) {
13733 RUN();
13734 uint64_t expected_z6[] = {0x0000000000000000, 0x44002c9a3e778061};
13735 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
13736 uint64_t expected_z7[] = {0x0802d285a6e4030b, 0x4c06623882552225};
13737 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
13738 uint64_t expected_z8[] = {0x100fa7c1819e90d8, 0x5410000000000000};
13739 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
13740 uint64_t expected_z9[] = {0x00000000000164d2, 0x0016942d003311c4};
13741 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
13742 uint64_t expected_z10[] = {0x0054f35b407d3e0c, 0x00800000608164d2};
13743 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
13744 uint64_t expected_z11[] = {0x00000016000000a8, 0x00c2018903d40400};
13745 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
13746 }
13747}
13748
13749TEST_SVE(sve_rev_p) {
13750 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13751 START();
13752
13753 Initialise(&masm,
13754 p0.VnB(),
13755 0xabcdabcdabcdabcd,
13756 0xabcdabcdabcdabcd,
13757 0xabcdabcdabcdabcd,
13758 0xabcdabcdabcdabcd);
13759
13760 __ Rev(p1.VnB(), p0.VnB());
13761 __ Rev(p2.VnH(), p0.VnH());
13762 __ Rev(p3.VnS(), p0.VnS());
13763 __ Rev(p4.VnD(), p0.VnD());
13764
13765 END();
13766
13767 if (CAN_RUN()) {
13768 RUN();
13769
13770 int p1_expected[] = {1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1};
13771 ASSERT_EQUAL_SVE(p1_expected, p1.VnB());
13772 int p2_expected[] = {0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0};
13773 ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
13774 int p3_expected[] = {1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0};
13775 ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
13776 int p4_expected[] = {1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1};
13777 ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
13778 }
13779}
13780
13781TEST_SVE(sve_trn_p_bh) {
13782 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13783 START();
13784
13785 Initialise(&masm, p0.VnB(), 0xa5a55a5a);
13786 __ Pfalse(p1.VnB());
13787
13788 __ Trn1(p2.VnB(), p0.VnB(), p0.VnB());
13789 __ Trn2(p3.VnB(), p0.VnB(), p0.VnB());
13790 __ Trn1(p4.VnB(), p1.VnB(), p0.VnB());
13791 __ Trn2(p5.VnB(), p1.VnB(), p0.VnB());
13792 __ Trn1(p6.VnB(), p0.VnB(), p1.VnB());
13793 __ Trn2(p7.VnB(), p0.VnB(), p1.VnB());
13794
13795 __ Trn1(p8.VnH(), p0.VnH(), p0.VnH());
13796 __ Trn2(p9.VnH(), p0.VnH(), p0.VnH());
13797 __ Trn1(p10.VnH(), p1.VnH(), p0.VnH());
13798 __ Trn2(p11.VnH(), p1.VnH(), p0.VnH());
13799 __ Trn1(p12.VnH(), p0.VnH(), p1.VnH());
13800 __ Trn2(p13.VnH(), p0.VnH(), p1.VnH());
13801
13802 END();
13803
13804 if (CAN_RUN()) {
13805 RUN();
13806 int p2_expected[] = {1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0};
13807 int p3_expected[] = {0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1};
13808 ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
13809 ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
13810
13811 int p4_expected[] = {1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0};
13812 int p5_expected[] = {0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0};
13813 ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
13814 ASSERT_EQUAL_SVE(p5_expected, p5.VnB());
13815
13816 int p6_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0};
13817 int p7_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1};
13818 ASSERT_EQUAL_SVE(p6_expected, p6.VnB());
13819 ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
13820
13821 int p8_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
13822 int p9_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
13823 ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
13824 ASSERT_EQUAL_SVE(p9_expected, p9.VnB());
13825
13826 int p10_expected[] = {0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0};
13827 int p11_expected[] = {0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0};
13828 ASSERT_EQUAL_SVE(p10_expected, p10.VnB());
13829 ASSERT_EQUAL_SVE(p11_expected, p11.VnB());
13830
13831 int p12_expected[] = {0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0};
13832 int p13_expected[] = {0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0};
13833 ASSERT_EQUAL_SVE(p12_expected, p12.VnB());
13834 ASSERT_EQUAL_SVE(p13_expected, p13.VnB());
13835 }
13836}
13837
13838TEST_SVE(sve_trn_p_sd) {
13839 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13840 START();
13841
13842 Initialise(&masm, p0.VnB(), 0x55a55aaa);
13843 __ Pfalse(p1.VnB());
13844
13845 __ Trn1(p2.VnS(), p0.VnS(), p0.VnS());
13846 __ Trn2(p3.VnS(), p0.VnS(), p0.VnS());
13847 __ Trn1(p4.VnS(), p1.VnS(), p0.VnS());
13848 __ Trn2(p5.VnS(), p1.VnS(), p0.VnS());
13849 __ Trn1(p6.VnS(), p0.VnS(), p1.VnS());
13850 __ Trn2(p7.VnS(), p0.VnS(), p1.VnS());
13851
13852 __ Trn1(p8.VnD(), p0.VnD(), p0.VnD());
13853 __ Trn2(p9.VnD(), p0.VnD(), p0.VnD());
13854 __ Trn1(p10.VnD(), p1.VnD(), p0.VnD());
13855 __ Trn2(p11.VnD(), p1.VnD(), p0.VnD());
13856 __ Trn1(p12.VnD(), p0.VnD(), p1.VnD());
13857 __ Trn2(p13.VnD(), p0.VnD(), p1.VnD());
13858
13859 END();
13860
13861 if (CAN_RUN()) {
13862 RUN();
13863 int p2_expected[] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0};
13864 int p3_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0};
13865 ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
13866 ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
13867
13868 int p4_expected[] = {1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0};
13869 int p5_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0};
13870 ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
13871 ASSERT_EQUAL_SVE(p5_expected, p5.VnB());
13872
13873 int p6_expected[] = {0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0};
13874 int p7_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0};
13875 ASSERT_EQUAL_SVE(p6_expected, p6.VnB());
13876 ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
13877
13878 int p8_expected[] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0};
13879 int p9_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
13880 ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
13881 ASSERT_EQUAL_SVE(p9_expected, p9.VnB());
13882
13883 int p10_expected[] = {1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
13884 int p11_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
13885 ASSERT_EQUAL_SVE(p10_expected, p10.VnB());
13886 ASSERT_EQUAL_SVE(p11_expected, p11.VnB());
13887
13888 int p12_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0};
13889 int p13_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0};
13890 ASSERT_EQUAL_SVE(p12_expected, p12.VnB());
13891 ASSERT_EQUAL_SVE(p13_expected, p13.VnB());
13892 }
13893}
13894
13895TEST_SVE(sve_zip_p_bh) {
13896 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13897 START();
13898
13899 Initialise(&masm,
13900 p0.VnB(),
13901 0x5a5a5a5a5a5a5a5a,
13902 0x5a5a5a5a5a5a5a5a,
13903 0x5a5a5a5a5a5a5a5a,
13904 0x5a5a5a5a5a5a5a5a);
13905 __ Pfalse(p1.VnB());
13906
13907 __ Zip1(p2.VnB(), p0.VnB(), p0.VnB());
13908 __ Zip2(p3.VnB(), p0.VnB(), p0.VnB());
13909 __ Zip1(p4.VnB(), p1.VnB(), p0.VnB());
13910 __ Zip2(p5.VnB(), p1.VnB(), p0.VnB());
13911 __ Zip1(p6.VnB(), p0.VnB(), p1.VnB());
13912 __ Zip2(p7.VnB(), p0.VnB(), p1.VnB());
13913
13914 __ Zip1(p8.VnH(), p0.VnH(), p0.VnH());
13915 __ Zip2(p9.VnH(), p0.VnH(), p0.VnH());
13916 __ Zip1(p10.VnH(), p1.VnH(), p0.VnH());
13917 __ Zip2(p11.VnH(), p1.VnH(), p0.VnH());
13918 __ Zip1(p12.VnH(), p0.VnH(), p1.VnH());
13919 __ Zip2(p13.VnH(), p0.VnH(), p1.VnH());
13920
13921 END();
13922
13923 if (CAN_RUN()) {
13924 RUN();
13925 int p2_expected[] = {0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0};
13926 int p3_expected[] = {0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0};
13927 ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
13928 ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
13929
13930 int p4_expected[] = {0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0};
13931 int p5_expected[] = {0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0};
13932 ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
13933 ASSERT_EQUAL_SVE(p5_expected, p5.VnB());
13934
13935 int p6_expected[] = {0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0};
13936 int p7_expected[] = {0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0};
13937 ASSERT_EQUAL_SVE(p6_expected, p6.VnB());
13938 ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
13939
13940 int p8_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0};
13941 int p9_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0};
13942 ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
13943 ASSERT_EQUAL_SVE(p9_expected, p9.VnB());
13944
13945 int p10_expected[] = {0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
13946 int p11_expected[] = {0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
13947 ASSERT_EQUAL_SVE(p10_expected, p10.VnB());
13948 ASSERT_EQUAL_SVE(p11_expected, p11.VnB());
13949
13950 int p12_expected[] = {0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0};
13951 int p13_expected[] = {0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0};
13952 ASSERT_EQUAL_SVE(p12_expected, p12.VnB());
13953 ASSERT_EQUAL_SVE(p13_expected, p13.VnB());
13954 }
13955}
13956
13957TEST_SVE(sve_zip_p_sd) {
13958 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
13959 START();
13960
13961 Initialise(&masm,
13962 p0.VnB(),
13963 0x5a5a5a5a5a5a5a5a,
13964 0x5a5a5a5a5a5a5a5a,
13965 0x5a5a5a5a5a5a5a5a,
13966 0x5a5a5a5a5a5a5a5a);
13967 __ Pfalse(p1.VnB());
13968
13969 __ Zip1(p2.VnS(), p0.VnS(), p0.VnS());
13970 __ Zip2(p3.VnS(), p0.VnS(), p0.VnS());
13971 __ Zip1(p4.VnS(), p1.VnS(), p0.VnS());
13972 __ Zip2(p5.VnS(), p1.VnS(), p0.VnS());
13973 __ Zip1(p6.VnS(), p0.VnS(), p1.VnS());
13974 __ Zip2(p7.VnS(), p0.VnS(), p1.VnS());
13975
13976 __ Zip1(p8.VnD(), p0.VnD(), p0.VnD());
13977 __ Zip2(p9.VnD(), p0.VnD(), p0.VnD());
13978 __ Zip1(p10.VnD(), p1.VnD(), p0.VnD());
13979 __ Zip2(p11.VnD(), p1.VnD(), p0.VnD());
13980 __ Zip1(p12.VnD(), p0.VnD(), p1.VnD());
13981 __ Zip2(p13.VnD(), p0.VnD(), p1.VnD());
13982
13983 END();
13984
13985 if (CAN_RUN()) {
13986 RUN();
13987 int p2_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0};
13988 int p3_expected[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0};
13989 ASSERT_EQUAL_SVE(p2_expected, p2.VnB());
13990 ASSERT_EQUAL_SVE(p3_expected, p3.VnB());
13991
13992 int p4_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0};
13993 int p5_expected[] = {0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0};
13994 ASSERT_EQUAL_SVE(p4_expected, p4.VnB());
13995 ASSERT_EQUAL_SVE(p5_expected, p5.VnB());
13996
13997 int p6_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0};
13998 int p7_expected[] = {0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0};
13999 ASSERT_EQUAL_SVE(p6_expected, p6.VnB());
14000 ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
14001
14002 int p8_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
14003 int p9_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
14004 ASSERT_EQUAL_SVE(p8_expected, p8.VnB());
14005 ASSERT_EQUAL_SVE(p9_expected, p9.VnB());
14006
14007 int p10_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14008 int p11_expected[] = {0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14009 ASSERT_EQUAL_SVE(p10_expected, p10.VnB());
14010 ASSERT_EQUAL_SVE(p11_expected, p11.VnB());
14011
14012 int p12_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0};
14013 int p13_expected[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0};
14014 ASSERT_EQUAL_SVE(p12_expected, p12.VnB());
14015 ASSERT_EQUAL_SVE(p13_expected, p13.VnB());
14016 }
14017}
14018
14019TEST_SVE(sve_uzp_p) {
14020 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14021 START();
14022
14023 Initialise(&masm,
14024 p0.VnB(),
14025 0xf0f0ff00ffff0000,
14026 0x4242424242424242,
14027 0x5a5a5a5a5a5a5a5a,
14028 0x0123456789abcdef);
14029 __ Rev(p1.VnB(), p0.VnB());
14030
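  // Zip1/Zip2 interleave the lanes of the two operands and Uzp1/Uzp2
  // de-interleave them, so each Zip/Uzp pair below should reproduce the
  // original p0 and p1.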
14031 __ Zip1(p2.VnB(), p0.VnB(), p1.VnB());
14032 __ Zip2(p3.VnB(), p0.VnB(), p1.VnB());
14033 __ Uzp1(p4.VnB(), p2.VnB(), p3.VnB());
14034 __ Uzp2(p5.VnB(), p2.VnB(), p3.VnB());
14035
14036 __ Zip1(p2.VnH(), p0.VnH(), p1.VnH());
14037 __ Zip2(p3.VnH(), p0.VnH(), p1.VnH());
14038 __ Uzp1(p6.VnH(), p2.VnH(), p3.VnH());
14039 __ Uzp2(p7.VnH(), p2.VnH(), p3.VnH());
14040
14041 __ Zip1(p2.VnS(), p0.VnS(), p1.VnS());
14042 __ Zip2(p3.VnS(), p0.VnS(), p1.VnS());
14043 __ Uzp1(p8.VnS(), p2.VnS(), p3.VnS());
14044 __ Uzp2(p9.VnS(), p2.VnS(), p3.VnS());
14045
14046 __ Zip1(p2.VnD(), p0.VnD(), p1.VnD());
14047 __ Zip2(p3.VnD(), p0.VnD(), p1.VnD());
14048 __ Uzp1(p10.VnD(), p2.VnD(), p3.VnD());
14049 __ Uzp2(p11.VnD(), p2.VnD(), p3.VnD());
14050
14051 END();
14052
14053 if (CAN_RUN()) {
14054 RUN();
14055
14056 ASSERT_EQUAL_SVE(p0, p4);
14057 ASSERT_EQUAL_SVE(p1, p5);
14058 ASSERT_EQUAL_SVE(p0, p6);
14059 ASSERT_EQUAL_SVE(p1, p7);
14060 ASSERT_EQUAL_SVE(p0, p8);
14061 ASSERT_EQUAL_SVE(p1, p9);
14062 ASSERT_EQUAL_SVE(p0, p10);
14063 ASSERT_EQUAL_SVE(p1, p11);
14064 }
14065}
14066
14067TEST_SVE(sve_punpk) {
14068 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14069 START();
14070
14071 auto get_64_bits_at = [](int byte_index) -> uint64_t {
14072 // Each 8-bit chunk has the value 0x50 + the byte index of the chunk.
14073 return 0x5756555453525150 + (0x0101010101010101 * byte_index);
14074 };
14075
14076 Initialise(&masm,
14077 p0.VnB(),
14078 get_64_bits_at(24),
14079 get_64_bits_at(16),
14080 get_64_bits_at(8),
14081 get_64_bits_at(0));
14082 __ Punpklo(p1.VnH(), p0.VnB());
14083 __ Punpkhi(p2.VnH(), p0.VnB());
14084
14085 END();
14086
14087 if (CAN_RUN()) {
14088 RUN();
14089
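    // pl is the predicate register length in bits (one predicate bit per
    // Z-register byte). Punpkhi unpacks the upper half of p0, whose pattern
    // starts at byte pl / (2 * 8) of the generated bit pattern.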
14090 int pl = config->sve_vl_in_bits() / kZRegBitsPerPRegBit;
14091 // For simplicity, just test the bottom 64 H-sized lanes.
14092 uint64_t p1_h_bits = get_64_bits_at(0);
14093 uint64_t p2_h_bits = get_64_bits_at(pl / (2 * 8));
14094 int p1_expected[64];
14095 int p2_expected[64];
14096 for (size_t i = 0; i < 64; i++) {
14097 p1_expected[63 - i] = (p1_h_bits >> i) & 1;
14098 p2_expected[63 - i] = (p2_h_bits >> i) & 1;
14099 }
14100 // Testing `VnH` ensures that odd-numbered B lanes are zero.
14101 ASSERT_EQUAL_SVE(p1_expected, p1.VnH());
14102 ASSERT_EQUAL_SVE(p2_expected, p2.VnH());
14103 }
14104}
14105
14106typedef void (MacroAssembler::*BrkFn)(const PRegisterWithLaneSize& pd,
14107 const PRegister& pg,
14108 const PRegisterWithLaneSize& pn);
14109
14110typedef void (MacroAssembler::*BrksFn)(const PRegisterWithLaneSize& pd,
14111 const PRegisterZ& pg,
14112 const PRegisterWithLaneSize& pn);
14113
14114template <typename T, size_t N>
14115static void BrkaBrkbHelper(Test* config,
14116 BrkFn macro,
14117 BrksFn macro_set_flags,
14118 const T (&pd_inputs)[N],
14119 const T (&pg_inputs)[N],
14120 const T (&pn_inputs)[N],
14121 const T (&pd_z_expected)[N]) {
14122 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14123 START();
14124
14125 PRegister pg = p10;
14126 PRegister pn = p9;
14127 PRegister pd_z = p0;
14128 PRegister pd_z_s = p1;
14129 PRegister pd_m = p2;
14130 Initialise(&masm, pg.VnB(), pg_inputs);
14131 Initialise(&masm, pn.VnB(), pn_inputs);
14132 Initialise(&masm, pd_m.VnB(), pd_inputs);
14133
14134 // Initialise NZCV to an impossible value, to check that we actually write it.
14135 __ Mov(x10, NZCVFlag);
14136 __ Msr(NZCV, x10);
14137
14138 (masm.*macro)(pd_z.VnB(), pg.Zeroing(), pn.VnB());
14139 (masm.*macro_set_flags)(pd_z_s.VnB(), pg.Zeroing(), pn.VnB());
14140 __ Mrs(x0, NZCV);
14141
14142 (masm.*macro)(pd_m.VnB(), pg.Merging(), pn.VnB());
14143
14144 END();
14145
14146 if (CAN_RUN()) {
14147 RUN();
14148
14149 ASSERT_EQUAL_SVE(pd_z_expected, pd_z.VnB());
14150
14151 // Check that the flags were properly set.
14152 StatusFlags nzcv_expected =
14153 GetPredTestFlags(pd_z_expected,
14154 pg_inputs,
14155 core.GetSVELaneCount(kBRegSize));
14156 ASSERT_EQUAL_64(nzcv_expected, x0);
14157 ASSERT_EQUAL_SVE(pd_z.VnB(), pd_z_s.VnB());
14158
14159 T pd_m_expected[N];
14160 // Set expected `pd` result on merging predication.
14161 for (size_t i = 0; i < N; i++) {
14162 pd_m_expected[i] = pg_inputs[i] ? pd_z_expected[i] : pd_inputs[i];
14163 }
14164 ASSERT_EQUAL_SVE(pd_m_expected, pd_m.VnB());
14165 }
14166}
14167
14168template <typename T>
14169static void BrkaHelper(Test* config,
14170 const T& pd_inputs,
14171 const T& pg_inputs,
14172 const T& pn_inputs,
14173 const T& pd_expected) {
14174 BrkaBrkbHelper(config,
14175 &MacroAssembler::Brka,
14176 &MacroAssembler::Brkas,
14177 pd_inputs,
14178 pg_inputs,
14179 pn_inputs,
14180 pd_expected);
14181}
14182
14183TEST_SVE(sve_brka) {
14184 // clang-format off
14185 // | boundary of 128-bit VL.
14186 // v
14187 int pd[] = {1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14188
14189 // | highest-numbered lane lowest-numbered lane |
14190 // v v
14191 int pg_1[] = {1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
14192 int pg_2[] = {1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
14193
14194 int pn_1[] = {1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
14195 int pn_2[] = {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14196 int pn_3[] = {1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1};
14197
14198 // | first break
14199 // v
14200 int exp_1_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0};
14201 // | first break
14202 // v
14203 int exp_1_2[] = {0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
14204 // | first break
14205 // v
14206 int exp_1_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
14207
14208 BrkaHelper(config, pd, pg_1, pn_1, exp_1_1);
14209 BrkaHelper(config, pd, pg_1, pn_2, exp_1_2);
14210 BrkaHelper(config, pd, pg_1, pn_3, exp_1_3);
14211
14212 // | first break
14213 // v
14214 int exp_2_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1};
14215 // | first break
14216 // v
14217 int exp_2_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
14218 // | first break
14219 // v
14220 int exp_2_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
14221 BrkaHelper(config, pd, pg_2, pn_1, exp_2_1);
14222 BrkaHelper(config, pd, pg_2, pn_2, exp_2_2);
14223 BrkaHelper(config, pd, pg_2, pn_3, exp_2_3);
14224
14225 // The all-inactive zeroing predicate sets the destination predicate to all-false.
14226 int pg_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14227 int exp_3_x[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14228 BrkaHelper(config, pd, pg_3, pn_1, exp_3_x);
14229 BrkaHelper(config, pd, pg_3, pn_2, exp_3_x);
14230 BrkaHelper(config, pd, pg_3, pn_3, exp_3_x);
14231 // clang-format on
14232}
14233
14234template <typename T>
14235static void BrkbHelper(Test* config,
14236 const T& pd_inputs,
14237 const T& pg_inputs,
14238 const T& pn_inputs,
14239 const T& pd_expected) {
14240 BrkaBrkbHelper(config,
14241 &MacroAssembler::Brkb,
14242 &MacroAssembler::Brkbs,
14243 pd_inputs,
14244 pg_inputs,
14245 pn_inputs,
14246 pd_expected);
14247}
14248
14249TEST_SVE(sve_brkb) {
14250 // clang-format off
14251 // | boundary of 128-bit VL.
14252 // v
14253 int pd[] = {1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14254
14255 // | highest-numbered lane lowest-numbered lane |
14256 // v v
14257 int pg_1[] = {1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
14258 int pg_2[] = {1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
14259
14260 int pn_1[] = {1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0};
14261 int pn_2[] = {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14262 int pn_3[] = {1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1};
14263
14264 // | first break
14265 // v
14266 int exp_1_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0};
14267 // | first break
14268 // v
14269 int exp_1_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0};
14270 // | first break
14271 // v
14272 int exp_1_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0};
14273
14274 BrkbHelper(config, pd, pg_1, pn_1, exp_1_1);
14275 BrkbHelper(config, pd, pg_1, pn_2, exp_1_2);
14276 BrkbHelper(config, pd, pg_1, pn_3, exp_1_3);
14277
14278 // | first break
14279 // v
14280 int exp_2_1[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1};
14281 // | first break
14282 // v
14283 int exp_2_2[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1};
14284 // | first break
14285 // v
14286 int exp_2_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14287 BrkbHelper(config, pd, pg_2, pn_1, exp_2_1);
14288 BrkbHelper(config, pd, pg_2, pn_2, exp_2_2);
14289 BrkbHelper(config, pd, pg_2, pn_3, exp_2_3);
14290
14291 // The all-inactive zeroing predicate sets the destination predicate to all-false.
14292 int pg_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14293 int exp_3_x[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14294 BrkbHelper(config, pd, pg_3, pn_1, exp_3_x);
14295 BrkbHelper(config, pd, pg_3, pn_2, exp_3_x);
14296 BrkbHelper(config, pd, pg_3, pn_3, exp_3_x);
14297 // clang-format on
14298}
14299
14300typedef void (MacroAssembler::*BrknFn)(const PRegisterWithLaneSize& pd,
14301 const PRegisterZ& pg,
14302 const PRegisterWithLaneSize& pn,
14303 const PRegisterWithLaneSize& pm);
14304
14305typedef void (MacroAssembler::*BrknsFn)(const PRegisterWithLaneSize& pd,
14306 const PRegisterZ& pg,
14307 const PRegisterWithLaneSize& pn,
14308 const PRegisterWithLaneSize& pm);
14309
14310enum BrknDstPredicateState { kAllFalse, kUnchanged };
14311
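// Brkn either leaves the destination (a copy of `pm`) unchanged or clears it to
// all-false, depending on whether the last active lane of `pn` is true; the enum
// above names the outcome each test case expects.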
14312template <typename T, size_t N>
14313static void BrknHelper(Test* config,
14314 const T (&pd_inputs)[N],
14315 const T (&pg_inputs)[N],
14316 const T (&pn_inputs)[N],
14317 const T (&pm_inputs)[N],
14318 BrknDstPredicateState expected_pd_state) {
14319 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14320 START();
14321
14322 PRegister pg = p10;
14323 PRegister pn = p9;
14324 PRegister pm = p8;
14325 PRegister pdm = p0;
14326 PRegister pd = p1;
14327 PRegister pd_s = p2;
14328 Initialise(&masm, pg.VnB(), pg_inputs);
14329 Initialise(&masm, pn.VnB(), pn_inputs);
14330 Initialise(&masm, pm.VnB(), pm_inputs);
14331 Initialise(&masm, pdm.VnB(), pm_inputs);
14332 Initialise(&masm, pd.VnB(), pd_inputs);
14333 Initialise(&masm, pd_s.VnB(), pd_inputs);
14334
14335 // Initialise NZCV to an impossible value, to check that we actually write it.
14336 __ Mov(x10, NZCVFlag);
14337 __ Msr(NZCV, x10);
14338
14339 __ Brkn(pdm.VnB(), pg.Zeroing(), pn.VnB(), pdm.VnB());
14340 // !pd.Aliases(pm).
14341 __ Brkn(pd.VnB(), pg.Zeroing(), pn.VnB(), pm.VnB());
14342 __ Brkns(pd_s.VnB(), pg.Zeroing(), pn.VnB(), pm.VnB());
14343 __ Mrs(x0, NZCV);
14344
14345 END();
14346
14347 if (CAN_RUN()) {
14348 RUN();
14349
14350 T all_false[N] = {0};
14351 if (expected_pd_state == kAllFalse) {
14352 ASSERT_EQUAL_SVE(all_false, pd.VnB());
14353 } else {
14354 ASSERT_EQUAL_SVE(pm_inputs, pd.VnB());
14355 }
14356 ASSERT_EQUAL_SVE(pm_inputs, pm.VnB());
14357
14358 T all_true[N];
14359 for (size_t i = 0; i < ArrayLength(all_true); i++) {
14360 all_true[i] = 1;
14361 }
14362
14363 // Check that the flags were properly set.
14364 StatusFlags nzcv_expected =
14365 GetPredTestFlags((expected_pd_state == kAllFalse) ? all_false
14366 : pm_inputs,
14367 all_true,
14368 core.GetSVELaneCount(kBRegSize));
14369 ASSERT_EQUAL_64(nzcv_expected, x0);
14370 ASSERT_EQUAL_SVE(pd.VnB(), pdm.VnB());
14371 ASSERT_EQUAL_SVE(pd.VnB(), pd_s.VnB());
14372 }
14373}
14374
14375TEST_SVE(sve_brkn) {
14376 int pd[] = {1, 0, 0, 1, 0, 1, 1, 0, 1, 0};
14377 int pm[] = {0, 1, 1, 1, 1, 0, 0, 1, 0, 1};
14378
14379 int pg_1[] = {1, 1, 0, 0, 1, 0, 1, 1, 0, 0};
14380 int pg_2[] = {0, 0, 0, 1, 1, 1, 0, 0, 1, 1};
14381 int pg_3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
14382
14383 int pn_1[] = {1, 0, 0, 0, 0, 1, 1, 0, 0, 0};
14384 int pn_2[] = {0, 1, 0, 1, 0, 0, 0, 0, 0, 0};
14385 int pn_3[] = {0, 0, 0, 0, 1, 1, 0, 0, 1, 1};
14386
14387 BrknHelper(config, pd, pg_1, pn_1, pm, kUnchanged);
14388 BrknHelper(config, pd, pg_1, pn_2, pm, kAllFalse);
14389 BrknHelper(config, pd, pg_1, pn_3, pm, kAllFalse);
14390
14391 BrknHelper(config, pd, pg_2, pn_1, pm, kAllFalse);
14392 BrknHelper(config, pd, pg_2, pn_2, pm, kUnchanged);
14393 BrknHelper(config, pd, pg_2, pn_3, pm, kAllFalse);
14394
14395 BrknHelper(config, pd, pg_3, pn_1, pm, kAllFalse);
14396 BrknHelper(config, pd, pg_3, pn_2, pm, kAllFalse);
14397 BrknHelper(config, pd, pg_3, pn_3, pm, kAllFalse);
14398}
14399
Martyn Capewell15f89012020-01-09 11:18:30 +000014400TEST_SVE(sve_trn) {
14401 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14402 START();
14403
14404 uint64_t in0[] = {0xffeeddccbbaa9988, 0x7766554433221100};
14405 uint64_t in1[] = {0xaa55aa55aa55aa55, 0x55aa55aa55aa55aa};
14406 InsrHelper(&masm, z0.VnD(), in0);
14407 InsrHelper(&masm, z1.VnD(), in1);
14408
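  // Trn1 interleaves the even-numbered elements of the two sources, and Trn2
  // interleaves the odd-numbered elements.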
14409 __ Trn1(z2.VnB(), z0.VnB(), z1.VnB());
14410 __ Trn2(z3.VnB(), z0.VnB(), z1.VnB());
14411 __ Trn1(z4.VnH(), z0.VnH(), z1.VnH());
14412 __ Trn2(z5.VnH(), z0.VnH(), z1.VnH());
14413 __ Trn1(z6.VnS(), z0.VnS(), z1.VnS());
14414 __ Trn2(z7.VnS(), z0.VnS(), z1.VnS());
14415 __ Trn1(z8.VnD(), z0.VnD(), z1.VnD());
14416 __ Trn2(z9.VnD(), z0.VnD(), z1.VnD());
14417
14418 END();
14419
14420 if (CAN_RUN()) {
14421 RUN();
14422 uint64_t expected_z2[] = {0x55ee55cc55aa5588, 0xaa66aa44aa22aa00};
14423 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
14424 uint64_t expected_z3[] = {0xaaffaaddaabbaa99, 0x5577555555335511};
14425 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
14426 uint64_t expected_z4[] = {0xaa55ddccaa559988, 0x55aa554455aa1100};
14427 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
14428 uint64_t expected_z5[] = {0xaa55ffeeaa55bbaa, 0x55aa776655aa3322};
14429 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
14430 uint64_t expected_z6[] = {0xaa55aa55bbaa9988, 0x55aa55aa33221100};
14431 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
14432 uint64_t expected_z7[] = {0xaa55aa55ffeeddcc, 0x55aa55aa77665544};
14433 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
14434 uint64_t expected_z8[] = {0x55aa55aa55aa55aa, 0x7766554433221100};
14435 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
14436 uint64_t expected_z9[] = {0xaa55aa55aa55aa55, 0xffeeddccbbaa9988};
14437 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
14438 }
14439}
14440
14441TEST_SVE(sve_zip_uzp) {
14442 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14443 START();
14444
14445 __ Dup(z0.VnD(), 0xffeeddccbbaa9988);
14446 __ Insr(z0.VnD(), 0x7766554433221100);
14447 __ Dup(z1.VnD(), 0xaa55aa55aa55aa55);
14448 __ Insr(z1.VnD(), 0x55aa55aa55aa55aa);
14449
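  // Zip1/Zip2 interleave elements from the low/high halves of the two sources;
  // Uzp1/Uzp2 concatenate the even/odd-numbered elements of their sources, so
  // unzipping the zipped results should reconstruct the original vectors.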
14450 __ Zip1(z2.VnB(), z0.VnB(), z1.VnB());
14451 __ Zip2(z3.VnB(), z0.VnB(), z1.VnB());
14452 __ Zip1(z4.VnH(), z0.VnH(), z1.VnH());
14453 __ Zip2(z5.VnH(), z0.VnH(), z1.VnH());
14454 __ Zip1(z6.VnS(), z0.VnS(), z1.VnS());
14455 __ Zip2(z7.VnS(), z0.VnS(), z1.VnS());
14456 __ Zip1(z8.VnD(), z0.VnD(), z1.VnD());
14457 __ Zip2(z9.VnD(), z0.VnD(), z1.VnD());
14458
14459 __ Uzp1(z10.VnB(), z2.VnB(), z3.VnB());
14460 __ Uzp2(z11.VnB(), z2.VnB(), z3.VnB());
14461 __ Uzp1(z12.VnH(), z4.VnH(), z5.VnH());
14462 __ Uzp2(z13.VnH(), z4.VnH(), z5.VnH());
14463 __ Uzp1(z14.VnS(), z6.VnS(), z7.VnS());
14464 __ Uzp2(z15.VnS(), z6.VnS(), z7.VnS());
14465 __ Uzp1(z16.VnD(), z8.VnD(), z9.VnD());
14466 __ Uzp2(z17.VnD(), z8.VnD(), z9.VnD());
14467
14468 END();
14469
14470 if (CAN_RUN()) {
14471 RUN();
14472 uint64_t expected_z2[] = {0x5577aa665555aa44, 0x5533aa225511aa00};
14473 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
14474 uint64_t expected_z3[] = {0xaaff55eeaadd55cc, 0xaabb55aaaa995588};
14475 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
14476 uint64_t expected_z4[] = {0x55aa776655aa5544, 0x55aa332255aa1100};
14477 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
14478 uint64_t expected_z5[] = {0xaa55ffeeaa55ddcc, 0xaa55bbaaaa559988};
14479 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
14480 uint64_t expected_z6[] = {0x55aa55aa77665544, 0x55aa55aa33221100};
14481 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
14482 uint64_t expected_z7[] = {0xaa55aa55ffeeddcc, 0xaa55aa55bbaa9988};
14483 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
14484 uint64_t expected_z8[] = {0x55aa55aa55aa55aa, 0x7766554433221100};
14485 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
14486 uint64_t expected_z9[] = {0xaa55aa55aa55aa55, 0xffeeddccbbaa9988};
14487 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
14488
14489    // Check that uzp is the inverse of zip.
14490 ASSERT_EQUAL_SVE(z0.VnD(), z10.VnD());
14491 ASSERT_EQUAL_SVE(z1.VnD(), z11.VnD());
14492 ASSERT_EQUAL_SVE(z0.VnD(), z12.VnD());
14493 ASSERT_EQUAL_SVE(z1.VnD(), z13.VnD());
14494 ASSERT_EQUAL_SVE(z0.VnD(), z14.VnD());
14495 ASSERT_EQUAL_SVE(z1.VnD(), z15.VnD());
14496 ASSERT_EQUAL_SVE(z0.VnD(), z16.VnD());
14497 ASSERT_EQUAL_SVE(z1.VnD(), z17.VnD());
14498 }
14499}
Martyn Capewell50e9f552020-01-07 17:45:03 +000014500
Martyn Capewell0b1afa82020-03-04 11:31:42 +000014501TEST_SVE(sve_fcadd) {
14502 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14503 START();
14504
14505 __ Dup(z30.VnS(), 0);
14506
14507 __ Ptrue(p0.VnB());
14508 __ Pfalse(p1.VnB());
14509 __ Zip1(p2.VnH(), p0.VnH(), p1.VnH()); // Real elements.
14510 __ Zip1(p3.VnH(), p1.VnH(), p0.VnH()); // Imaginary elements.
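  // Zipping an all-true with an all-false predicate gives alternating active
  // lanes, so p2 and p3 select the real and imaginary half of each complex
  // pair respectively.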
14511
14512 __ Fdup(z0.VnH(), 10.0); // 10i + 10
14513 __ Fdup(z1.VnH(), 5.0); // 5i + 5
14514 __ Index(z7.VnH(), 1, 1);
14515 __ Scvtf(z7.VnH(), p0.Merging(), z7.VnH()); // Ai + B
14516
14517 __ Sel(z2.VnH(), p3, z1.VnH(), z30.VnH()); // 5i + 0
14518 __ Sel(z3.VnH(), p2, z1.VnH(), z30.VnH()); // 0i + 5
14519 __ Sel(z7.VnH(), p3, z7.VnH(), z0.VnH()); // Ai + 10
Martyn Capewellbebdfeb2021-03-04 17:31:19 +000014520 __ Mov(z8, z7);
14521 __ Ext(z8.VnB(), z8.VnB(), z8.VnB(), 2);
Martyn Capewell0b1afa82020-03-04 11:31:42 +000014522 __ Sel(z8.VnH(), p2, z8.VnH(), z30.VnH()); // 0i + A
14523
14524 // (10i + 10) + rotate(5i + 0, 90)
14525 // = (10i + 10) + (0i - 5)
14526 // = 10i + 5
14527 __ Fcadd(z4.VnH(), p0.Merging(), z0.VnH(), z2.VnH(), 90);
14528
14529 // (10i + 5) + rotate(0i + 5, 270)
14530 // = (10i + 5) + (-5i + 0)
14531 // = 5i + 5
14532 __ Fcadd(z4.VnH(), p0.Merging(), z4.VnH(), z3.VnH(), 270);
14533
14534 // The same calculation, but selecting real/imaginary using predication.
14535 __ Mov(z5, z0);
14536 __ Fcadd(z5.VnH(), p2.Merging(), z5.VnH(), z1.VnH(), 90);
14537 __ Fcadd(z5.VnH(), p3.Merging(), z5.VnH(), z1.VnH(), 270);
14538
14539 // Reference calculation: (10i + 10) - (5i + 5)
14540 __ Fsub(z6.VnH(), z0.VnH(), z1.VnH());
14541
14542 // Calculation using varying imaginary values.
14543 // (Ai + 10) + rotate(5i + 0, 90)
14544 // = (Ai + 10) + (0i - 5)
14545 // = Ai + 5
14546 __ Fcadd(z7.VnH(), p0.Merging(), z7.VnH(), z2.VnH(), 90);
14547
14548 // (Ai + 5) + rotate(0i + A, 270)
14549 // = (Ai + 5) + (-Ai + 0)
14550 // = 5
14551 __ Fcadd(z7.VnH(), p0.Merging(), z7.VnH(), z8.VnH(), 270);
14552
14553 // Repeated, but for wider elements.
14554 __ Zip1(p2.VnS(), p0.VnS(), p1.VnS());
14555 __ Zip1(p3.VnS(), p1.VnS(), p0.VnS());
14556 __ Fdup(z0.VnS(), 42.0);
14557 __ Fdup(z1.VnS(), 21.0);
14558 __ Index(z11.VnS(), 1, 1);
14559 __ Scvtf(z11.VnS(), p0.Merging(), z11.VnS());
14560 __ Sel(z2.VnS(), p3, z1.VnS(), z30.VnS());
14561 __ Sel(z29.VnS(), p2, z1.VnS(), z30.VnS());
14562 __ Sel(z11.VnS(), p3, z11.VnS(), z0.VnS());
Martyn Capewellbebdfeb2021-03-04 17:31:19 +000014563 __ Mov(z12, z11);
14564 __ Ext(z12.VnB(), z12.VnB(), z12.VnB(), 4);
Martyn Capewell0b1afa82020-03-04 11:31:42 +000014565 __ Sel(z12.VnS(), p2, z12.VnS(), z30.VnS());
14566 __ Fcadd(z8.VnS(), p0.Merging(), z0.VnS(), z2.VnS(), 90);
14567 __ Fcadd(z8.VnS(), p0.Merging(), z8.VnS(), z29.VnS(), 270);
14568 __ Mov(z9, z0);
14569 __ Fcadd(z9.VnS(), p2.Merging(), z9.VnS(), z1.VnS(), 90);
14570 __ Fcadd(z9.VnS(), p3.Merging(), z9.VnS(), z1.VnS(), 270);
14571 __ Fsub(z10.VnS(), z0.VnS(), z1.VnS());
14572 __ Fcadd(z11.VnS(), p0.Merging(), z11.VnS(), z2.VnS(), 90);
14573 __ Fcadd(z11.VnS(), p0.Merging(), z11.VnS(), z12.VnS(), 270);
14574
14575 __ Zip1(p2.VnD(), p0.VnD(), p1.VnD());
14576 __ Zip1(p3.VnD(), p1.VnD(), p0.VnD());
14577 __ Fdup(z0.VnD(), -42.0);
14578 __ Fdup(z1.VnD(), -21.0);
14579 __ Index(z15.VnD(), 1, 1);
14580 __ Scvtf(z15.VnD(), p0.Merging(), z15.VnD());
14581 __ Sel(z2.VnD(), p3, z1.VnD(), z30.VnD());
14582 __ Sel(z28.VnD(), p2, z1.VnD(), z30.VnD());
14583 __ Sel(z15.VnD(), p3, z15.VnD(), z0.VnD());
Martyn Capewellbebdfeb2021-03-04 17:31:19 +000014584 __ Mov(z16, z15);
14585 __ Ext(z16.VnB(), z16.VnB(), z16.VnB(), 8);
Martyn Capewell0b1afa82020-03-04 11:31:42 +000014586 __ Sel(z16.VnD(), p2, z16.VnD(), z30.VnD());
14587 __ Fcadd(z12.VnD(), p0.Merging(), z0.VnD(), z2.VnD(), 90);
14588 __ Fcadd(z12.VnD(), p0.Merging(), z12.VnD(), z28.VnD(), 270);
14589 __ Mov(z13, z0);
14590 __ Fcadd(z13.VnD(), p2.Merging(), z13.VnD(), z1.VnD(), 90);
14591 __ Fcadd(z13.VnD(), p3.Merging(), z13.VnD(), z1.VnD(), 270);
14592 __ Fsub(z14.VnD(), z0.VnD(), z1.VnD());
14593 __ Fcadd(z15.VnD(), p0.Merging(), z15.VnD(), z2.VnD(), 90);
14594 __ Fcadd(z15.VnD(), p0.Merging(), z15.VnD(), z16.VnD(), 270);
14595 END();
14596
14597 if (CAN_RUN()) {
14598 RUN();
14599 ASSERT_EQUAL_SVE(z6.VnH(), z4.VnH());
14600 ASSERT_EQUAL_SVE(z6.VnH(), z5.VnH());
14601 ASSERT_EQUAL_SVE(z3.VnH(), z7.VnH());
14602 ASSERT_EQUAL_SVE(z10.VnS(), z8.VnS());
14603 ASSERT_EQUAL_SVE(z10.VnS(), z9.VnS());
14604 ASSERT_EQUAL_SVE(z29.VnS(), z11.VnS());
14605 ASSERT_EQUAL_SVE(z14.VnD(), z12.VnD());
14606 ASSERT_EQUAL_SVE(z14.VnD(), z13.VnD());
14607 ASSERT_EQUAL_SVE(z28.VnS(), z15.VnS());
14608 }
14609}
14610
Martyn Capewelle4886e52020-03-30 09:28:52 +010014611TEST_SVE(sve_fcmla_index) {
14612 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14613 START();
14614
14615 __ Ptrue(p0.VnB());
14616
14617 __ Fdup(z0.VnH(), 10.0);
14618 __ Fdup(z2.VnH(), 2.0);
14619 __ Zip1(z0.VnH(), z0.VnH(), z2.VnH());
14620
14621 // Duplicate complex numbers across z2 segments. First segment has 1i+0,
14622 // second has 3i+2, etc.
14623 __ Index(z1.VnH(), 0, 1);
14624 __ Scvtf(z1.VnH(), p0.Merging(), z1.VnH());
14625 __ Zip1(z2.VnS(), z1.VnS(), z1.VnS());
14626 __ Zip1(z2.VnS(), z2.VnS(), z2.VnS());
14627
14628 // Derive a vector from z2 where only the third element in each segment
14629 // contains a complex number, with other elements zero.
14630 __ Index(z3.VnS(), 0, 1);
14631 __ And(z3.VnS(), z3.VnS(), 3);
14632 __ Cmpeq(p2.VnS(), p0.Zeroing(), z3.VnS(), 2);
14633 __ Dup(z3.VnB(), 0);
14634 __ Sel(z3.VnS(), p2, z2.VnS(), z3.VnS());
14635
14636 // Use indexed complex multiply on this vector, indexing the third element.
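  // For the indexed form of Fcmla, the immediate selects one complex (real,
  // imaginary) pair within each 128-bit segment of the indexed operand.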
14637 __ Dup(z4.VnH(), 0);
14638 __ Fcmla(z4.VnH(), z0.VnH(), z3.VnH(), 2, 0);
14639 __ Fcmla(z4.VnH(), z0.VnH(), z3.VnH(), 2, 90);
14640
14641 // Rotate the indexed complex number and repeat, negated, and with a different
14642 // index.
14643 __ Ext(z3.VnH(), z3.VnH(), z3.VnH(), 4);
14644 __ Dup(z5.VnH(), 0);
14645 __ Fcmla(z5.VnH(), z0.VnH(), z3.VnH(), 1, 180);
14646 __ Fcmla(z5.VnH(), z0.VnH(), z3.VnH(), 1, 270);
14647 __ Fneg(z5.VnH(), p0.Merging(), z5.VnH());
14648
14649 // Create a reference result from a vector complex multiply.
14650 __ Dup(z6.VnH(), 0);
Martyn Capewelle6100d92021-03-18 16:56:43 +000014651 __ Fcmla(z6.VnH(), p0.Merging(), z6.VnH(), z0.VnH(), z2.VnH(), 0);
14652 __ Fcmla(z6.VnH(), p0.Merging(), z6.VnH(), z0.VnH(), z2.VnH(), 90);
Martyn Capewelle4886e52020-03-30 09:28:52 +010014653
14654 // Repeated, but for wider elements.
14655 __ Fdup(z0.VnS(), 42.0);
14656 __ Fdup(z2.VnS(), 24.0);
14657 __ Zip1(z0.VnS(), z0.VnS(), z2.VnS());
14658 __ Index(z1.VnS(), -42, 13);
14659 __ Scvtf(z1.VnS(), p0.Merging(), z1.VnS());
14660 __ Zip1(z2.VnD(), z1.VnD(), z1.VnD());
14661 __ Zip1(z2.VnD(), z2.VnD(), z2.VnD());
14662 __ Index(z3.VnD(), 0, 1);
14663 __ And(z3.VnD(), z3.VnD(), 1);
14664 __ Cmpeq(p2.VnD(), p0.Zeroing(), z3.VnD(), 1);
14665 __ Dup(z3.VnB(), 0);
14666 __ Sel(z3.VnD(), p2, z2.VnD(), z3.VnD());
14667 __ Dup(z7.VnS(), 0);
14668 __ Fcmla(z7.VnS(), z0.VnS(), z3.VnS(), 1, 0);
14669 __ Fcmla(z7.VnS(), z0.VnS(), z3.VnS(), 1, 90);
14670 __ Ext(z3.VnB(), z3.VnB(), z3.VnB(), 8);
14671 __ Dup(z8.VnS(), 0);
14672 __ Fcmla(z8.VnS(), z0.VnS(), z3.VnS(), 0, 180);
14673 __ Fcmla(z8.VnS(), z0.VnS(), z3.VnS(), 0, 270);
14674 __ Fneg(z8.VnS(), p0.Merging(), z8.VnS());
14675 __ Dup(z9.VnS(), 0);
Martyn Capewelle6100d92021-03-18 16:56:43 +000014676 __ Fcmla(z9.VnS(), p0.Merging(), z9.VnS(), z0.VnS(), z2.VnS(), 0);
14677 __ Fcmla(z9.VnS(), p0.Merging(), z9.VnS(), z0.VnS(), z2.VnS(), 90);
Martyn Capewelle4886e52020-03-30 09:28:52 +010014678 END();
14679
14680 if (CAN_RUN()) {
14681 RUN();
14682 ASSERT_EQUAL_SVE(z6.VnH(), z4.VnH());
14683 ASSERT_EQUAL_SVE(z6.VnH(), z5.VnH());
14684 ASSERT_EQUAL_SVE(z9.VnS(), z7.VnS());
14685 ASSERT_EQUAL_SVE(z9.VnS(), z8.VnS());
14686 }
14687}
14688
Martyn Capewell75f1c432020-03-30 09:23:27 +010014689TEST_SVE(sve_fcmla) {
14690 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14691 START();
14692
14693 __ Ptrue(p0.VnB());
14694 __ Pfalse(p1.VnB());
14695 __ Zip1(p2.VnH(), p0.VnH(), p1.VnH()); // Real elements.
14696 __ Zip1(p3.VnH(), p1.VnH(), p0.VnH()); // Imaginary elements.
14697
14698 __ Fdup(z0.VnH(), 10.0);
14699 __ Fdup(z2.VnH(), 2.0);
14700
14701 // Create pairs of complex numbers, Ai + A. A is chosen to be non-zero, as
14702 // the later fneg will result in a failed comparison otherwise.
14703 __ Index(z1.VnH(), -4, 3);
14704 __ Zip1(z1.VnH(), z1.VnH(), z1.VnH());
14705 __ Zip1(z1.VnH(), z1.VnH(), z1.VnH());
14706 __ Scvtf(z1.VnH(), p0.Merging(), z1.VnH());
14707
14708 __ Sel(z3.VnH(), p2, z0.VnH(), z1.VnH()); // Ai + 10
14709 __ Sel(z4.VnH(), p2, z1.VnH(), z2.VnH()); // 2i + A
14710
14711 __ Zip1(p2.VnS(), p0.VnS(), p1.VnS()); // Even complex numbers.
14712 __ Zip1(p3.VnS(), p1.VnS(), p0.VnS()); // Odd complex numbers.
14713
14714 // Calculate (Ai + 10) * (2i + A) = (20 + A^2)i + 8A, using predication to
14715 // select only the complex numbers in odd-numbered element pairs. This leaves
14716 // results in elements 2/3, 6/7, etc. with zero in elements 0/1, 4/5, etc.
14717 // ... 7 6 5 4 3 2 1 0 <-- element
14718 // ... | 20+A^2 | 8A | 0 | 0 | 20+A^2 | 8A | 0 | 0 | <-- value
14719 __ Dup(z5.VnH(), 0);
Martyn Capewelle6100d92021-03-18 16:56:43 +000014720 __ Fcmla(z5.VnH(), p3.Merging(), z5.VnH(), z4.VnH(), z3.VnH(), 0);
14721 __ Fcmla(z5.VnH(), p3.Merging(), z5.VnH(), z4.VnH(), z3.VnH(), 90);
Martyn Capewell75f1c432020-03-30 09:23:27 +010014722
14723 // Move the odd results to the even result positions.
14724 // ... 7 6 5 4 3 2 1 0 <-- element
14725 // ... | 0 | 0 | 20+A^2 | 8A | 0 | 0 | 20+A^2 | 8A | <-- value
14726 __ Ext(z5.VnB(), z5.VnB(), z5.VnB(), 4);
14727
14728 // Calculate -(Ai + 10) * (2i + A) = -(20 + A^2)i - 8A for the even complex
14729 // numbers.
14730 // ... 7 6 5 4 3 2 1 0 <-- element
14731 // ... | 0 | 0 | -20-A^2 | -8A | 0 | 0 | -20-A^2 | -8A | <-- value
14732 __ Dup(z6.VnH(), 0);
Martyn Capewelle6100d92021-03-18 16:56:43 +000014733 __ Fcmla(z6.VnH(), p2.Merging(), z6.VnH(), z4.VnH(), z3.VnH(), 180);
14734 __ Fcmla(z6.VnH(), p2.Merging(), z6.VnH(), z4.VnH(), z3.VnH(), 270);
Martyn Capewell75f1c432020-03-30 09:23:27 +010014735
14736 // Negate the even results. The results in z6 should now match the results
14737 // computed earlier in z5.
14738 // ... 7 6 5 4 3 2 1 0 <-- element
14739 // ... | 0 | 0 | 20+A^2 | 8A | 0 | 0 | 20+A^2 | 8A | <-- value
14740 __ Fneg(z6.VnH(), p2.Merging(), z6.VnH());
14741
14742
14743 // Similarly, but for wider elements.
14744 __ Zip1(p2.VnS(), p0.VnS(), p1.VnS());
14745 __ Zip1(p3.VnS(), p1.VnS(), p0.VnS());
14746 __ Index(z1.VnS(), -4, 3);
14747 __ Zip1(z1.VnS(), z1.VnS(), z1.VnS());
14748 __ Zip1(z1.VnS(), z1.VnS(), z1.VnS());
14749 __ Scvtf(z1.VnS(), p0.Merging(), z1.VnS());
14750 __ Fdup(z0.VnS(), 20.0);
14751 __ Fdup(z2.VnS(), 21.0);
14752 __ Sel(z3.VnS(), p2, z0.VnS(), z1.VnS());
14753 __ Sel(z4.VnS(), p2, z1.VnS(), z2.VnS());
14754 __ Punpklo(p2.VnH(), p2.VnB());
14755 __ Punpklo(p3.VnH(), p3.VnB());
14756 __ Dup(z7.VnS(), 0);
Martyn Capewelle6100d92021-03-18 16:56:43 +000014757 __ Fcmla(z7.VnS(), p3.Merging(), z7.VnS(), z4.VnS(), z3.VnS(), 0);
14758 __ Fcmla(z7.VnS(), p3.Merging(), z7.VnS(), z4.VnS(), z3.VnS(), 90);
Martyn Capewell75f1c432020-03-30 09:23:27 +010014759 __ Ext(z7.VnB(), z7.VnB(), z7.VnB(), 8);
14760 __ Dup(z8.VnS(), 0);
Martyn Capewelle6100d92021-03-18 16:56:43 +000014761 __ Fcmla(z8.VnS(), p2.Merging(), z8.VnS(), z4.VnS(), z3.VnS(), 180);
14762 __ Fcmla(z8.VnS(), p2.Merging(), z8.VnS(), z4.VnS(), z3.VnS(), 270);
Martyn Capewell75f1c432020-03-30 09:23:27 +010014763 __ Fneg(z8.VnS(), p2.Merging(), z8.VnS());
14764
14765 // Double precision computed for even lanes only.
14766 __ Zip1(p2.VnD(), p0.VnD(), p1.VnD());
14767 __ Index(z1.VnD(), -4, 3);
14768 __ Zip1(z1.VnD(), z1.VnD(), z1.VnD());
14769 __ Zip1(z1.VnD(), z1.VnD(), z1.VnD());
14770 __ Scvtf(z1.VnD(), p0.Merging(), z1.VnD());
14771 __ Fdup(z0.VnD(), 20.0);
14772 __ Fdup(z2.VnD(), 21.0);
14773 __ Sel(z3.VnD(), p2, z0.VnD(), z1.VnD());
14774 __ Sel(z4.VnD(), p2, z1.VnD(), z2.VnD());
14775 __ Punpklo(p2.VnH(), p2.VnB());
14776 __ Dup(z9.VnD(), 0);
Martyn Capewelle6100d92021-03-18 16:56:43 +000014777 __ Fcmla(z9.VnD(), p2.Merging(), z9.VnD(), z4.VnD(), z3.VnD(), 0);
14778 __ Fcmla(z9.VnD(), p2.Merging(), z9.VnD(), z4.VnD(), z3.VnD(), 90);
Martyn Capewell75f1c432020-03-30 09:23:27 +010014779 __ Dup(z10.VnD(), 0);
Martyn Capewelle6100d92021-03-18 16:56:43 +000014780 __ Fcmla(z10.VnD(), p2.Merging(), z10.VnD(), z4.VnD(), z3.VnD(), 180);
14781 __ Fcmla(z10.VnD(), p2.Merging(), z10.VnD(), z4.VnD(), z3.VnD(), 270);
Martyn Capewell75f1c432020-03-30 09:23:27 +010014782 __ Fneg(z10.VnD(), p2.Merging(), z10.VnD());
14783 END();
14784
14785 if (CAN_RUN()) {
14786 RUN();
14787 ASSERT_EQUAL_SVE(z5.VnH(), z6.VnH());
14788 ASSERT_EQUAL_SVE(z7.VnS(), z8.VnS());
14789 ASSERT_EQUAL_SVE(z9.VnD(), z10.VnD());
14790 }
14791}
14792
Martyn Capewell46352612020-07-02 15:47:54 +010014793// Create a pattern in dst where the value of each element in src is incremented
14794// by the segment number. This allows varying a short input by a predictable
14795// pattern for each segment.
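// For example, for H-sized lanes (eight lanes per 128-bit segment):
//   dst[i] = src[i] + (i / 8)
// so lanes 0-7 are unchanged, lanes 8-15 are incremented by 1.0, and so on.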
14796static void FPSegmentPatternHelper(MacroAssembler* masm,
14797 const ZRegister& dst,
14798 const PRegisterM& ptrue,
14799 const ZRegister& src) {
14800 VIXL_ASSERT(AreSameLaneSize(dst, src));
14801 UseScratchRegisterScope temps(masm);
14802 ZRegister ztmp = temps.AcquireZ().WithSameLaneSizeAs(dst);
14803 masm->Index(ztmp, 0, 1);
14804 masm->Asr(ztmp, ztmp, kQRegSizeInBytesLog2 - dst.GetLaneSizeInBytesLog2());
14805 masm->Scvtf(ztmp, ptrue, ztmp);
14806 masm->Fadd(dst, src, ztmp);
14807}
14808
Martyn Capewell50e9f552020-01-07 17:45:03 +000014809TEST_SVE(sve_fpmul_index) {
14810 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14811 START();
14812
14813 uint64_t in0[] = {0x3ff000003f803c00, 0xbff00000bf80bc00};
14814 uint64_t in1[] = {0x3ff012343ff03c76, 0xbff01234bff0bc76};
14815
Martyn Capewell46352612020-07-02 15:47:54 +010014816 __ Ptrue(p0.VnB());
14817 // Repeat indexed vector across up to 2048-bit VL.
14818 for (size_t i = 0; i < (kZRegMaxSize / kDRegSize); i++) {
14819 InsrHelper(&masm, z25.VnD(), in0);
14820 }
Martyn Capewell50e9f552020-01-07 17:45:03 +000014821 InsrHelper(&masm, z1.VnD(), in1);
14822
Martyn Capewell46352612020-07-02 15:47:54 +010014823 FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z25.VnH());
Martyn Capewell50e9f552020-01-07 17:45:03 +000014824 __ Fmul(z2.VnH(), z1.VnH(), z0.VnH(), 0);
14825 __ Fmul(z3.VnH(), z1.VnH(), z0.VnH(), 1);
14826 __ Fmul(z4.VnH(), z1.VnH(), z0.VnH(), 4);
14827 __ Fmul(z5.VnH(), z1.VnH(), z0.VnH(), 7);
14828
14829 __ Fmul(z6.VnS(), z1.VnS(), z0.VnS(), 0);
14830 __ Fmul(z7.VnS(), z1.VnS(), z0.VnS(), 1);
14831 __ Fmul(z8.VnS(), z1.VnS(), z0.VnS(), 2);
14832 __ Fmul(z9.VnS(), z1.VnS(), z0.VnS(), 3);
14833
14834 __ Fmul(z10.VnD(), z1.VnD(), z0.VnD(), 0);
14835 __ Fmul(z11.VnD(), z1.VnD(), z0.VnD(), 1);
14836
14837 // Compute the results using other instructions.
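  // Fmul (indexed) multiplies by the lane at the given index within each
  // 128-bit segment. Since z25 holds the same 128-bit pattern in every
  // segment, the indexed lane can be modelled by duplicating it across the
  // vector and re-applying the per-segment increment before an unindexed Fmul.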
Martyn Capewell46352612020-07-02 15:47:54 +010014838 __ Dup(z12.VnH(), z25.VnH(), 0);
14839 FPSegmentPatternHelper(&masm, z12.VnH(), p0.Merging(), z12.VnH());
Martyn Capewell50e9f552020-01-07 17:45:03 +000014840 __ Fmul(z12.VnH(), z1.VnH(), z12.VnH());
Martyn Capewell46352612020-07-02 15:47:54 +010014841 __ Dup(z13.VnH(), z25.VnH(), 1);
14842 FPSegmentPatternHelper(&masm, z13.VnH(), p0.Merging(), z13.VnH());
Martyn Capewell50e9f552020-01-07 17:45:03 +000014843 __ Fmul(z13.VnH(), z1.VnH(), z13.VnH());
Martyn Capewell46352612020-07-02 15:47:54 +010014844 __ Dup(z14.VnH(), z25.VnH(), 4);
14845 FPSegmentPatternHelper(&masm, z14.VnH(), p0.Merging(), z14.VnH());
Martyn Capewell50e9f552020-01-07 17:45:03 +000014846 __ Fmul(z14.VnH(), z1.VnH(), z14.VnH());
Martyn Capewell46352612020-07-02 15:47:54 +010014847 __ Dup(z15.VnH(), z25.VnH(), 7);
14848 FPSegmentPatternHelper(&masm, z15.VnH(), p0.Merging(), z15.VnH());
Martyn Capewell50e9f552020-01-07 17:45:03 +000014849 __ Fmul(z15.VnH(), z1.VnH(), z15.VnH());
14850
Martyn Capewell46352612020-07-02 15:47:54 +010014851 __ Dup(z16.VnS(), z25.VnS(), 0);
14852 FPSegmentPatternHelper(&masm, z16.VnH(), p0.Merging(), z16.VnH());
Martyn Capewell50e9f552020-01-07 17:45:03 +000014853 __ Fmul(z16.VnS(), z1.VnS(), z16.VnS());
Martyn Capewell46352612020-07-02 15:47:54 +010014854 __ Dup(z17.VnS(), z25.VnS(), 1);
14855 FPSegmentPatternHelper(&masm, z17.VnH(), p0.Merging(), z17.VnH());
Martyn Capewell50e9f552020-01-07 17:45:03 +000014856 __ Fmul(z17.VnS(), z1.VnS(), z17.VnS());
Martyn Capewell46352612020-07-02 15:47:54 +010014857 __ Dup(z18.VnS(), z25.VnS(), 2);
14858 FPSegmentPatternHelper(&masm, z18.VnH(), p0.Merging(), z18.VnH());
Martyn Capewell50e9f552020-01-07 17:45:03 +000014859 __ Fmul(z18.VnS(), z1.VnS(), z18.VnS());
Martyn Capewell46352612020-07-02 15:47:54 +010014860 __ Dup(z19.VnS(), z25.VnS(), 3);
14861 FPSegmentPatternHelper(&masm, z19.VnH(), p0.Merging(), z19.VnH());
Martyn Capewell50e9f552020-01-07 17:45:03 +000014862 __ Fmul(z19.VnS(), z1.VnS(), z19.VnS());
14863
Martyn Capewell46352612020-07-02 15:47:54 +010014864 __ Dup(z20.VnD(), z25.VnD(), 0);
14865 FPSegmentPatternHelper(&masm, z20.VnH(), p0.Merging(), z20.VnH());
Martyn Capewell50e9f552020-01-07 17:45:03 +000014866 __ Fmul(z20.VnD(), z1.VnD(), z20.VnD());
Martyn Capewell46352612020-07-02 15:47:54 +010014867 __ Dup(z21.VnD(), z25.VnD(), 1);
14868 FPSegmentPatternHelper(&masm, z21.VnH(), p0.Merging(), z21.VnH());
Martyn Capewell50e9f552020-01-07 17:45:03 +000014869 __ Fmul(z21.VnD(), z1.VnD(), z21.VnD());
14870
14871 END();
14872
14873 if (CAN_RUN()) {
14874 RUN();
14875 ASSERT_EQUAL_SVE(z12.VnH(), z2.VnH());
14876 ASSERT_EQUAL_SVE(z13.VnH(), z3.VnH());
14877 ASSERT_EQUAL_SVE(z14.VnH(), z4.VnH());
14878 ASSERT_EQUAL_SVE(z15.VnH(), z5.VnH());
14879 ASSERT_EQUAL_SVE(z16.VnS(), z6.VnS());
14880 ASSERT_EQUAL_SVE(z17.VnS(), z7.VnS());
14881 ASSERT_EQUAL_SVE(z18.VnS(), z8.VnS());
14882 ASSERT_EQUAL_SVE(z19.VnS(), z9.VnS());
14883 ASSERT_EQUAL_SVE(z20.VnD(), z10.VnD());
14884 ASSERT_EQUAL_SVE(z21.VnD(), z11.VnD());
14885 }
14886}
14887
Martyn Capewell5fb2ad62020-01-10 14:08:27 +000014888TEST_SVE(sve_ftmad) {
14889 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
14890 START();
14891
14892 uint64_t in_h0[] = {0x7c027e01fc02fe01,
14893 0x3c003c00bc00bc00,
14894 0x3c003c00bc00bc00};
14895 uint64_t in_h1[] = {0xfe01fc027e017e01,
14896 0x3c00bc003c00bc00,
14897 0x3c00bc003c00bc00};
14898 uint64_t in_s0[] = {0x7f800002ffc00001,
14899 0x3f8000003f800000,
14900 0xbf800000bf800000};
14901 uint64_t in_s1[] = {0xffc00001ffc00001,
14902 0x3f800000bf800000,
14903 0x3f800000bf800000};
14904 uint64_t in_d0[] = {0x7ff8000000000001,
14905 0x3ff0000000000000,
14906 0xbff0000000000000};
14907 uint64_t in_d1[] = {0xfff0000000000002,
14908 0xbff0000000000000,
14909 0x3ff0000000000000};
14910 InsrHelper(&masm, z0.VnD(), in_h0);
14911 InsrHelper(&masm, z1.VnD(), in_h1);
14912 InsrHelper(&masm, z2.VnD(), in_s0);
14913 InsrHelper(&masm, z3.VnD(), in_s1);
14914 InsrHelper(&masm, z4.VnD(), in_d0);
14915 InsrHelper(&masm, z5.VnD(), in_d1);
14916
14917 __ Mov(z6, z0);
14918 __ Ftmad(z6.VnH(), z6.VnH(), z1.VnH(), 0);
14919 __ Mov(z7, z0);
14920 __ Ftmad(z7.VnH(), z7.VnH(), z1.VnH(), 1);
14921 __ Mov(z8, z0);
14922 __ Ftmad(z8.VnH(), z8.VnH(), z1.VnH(), 2);
14923
14924 __ Mov(z9, z2);
14925 __ Ftmad(z9.VnS(), z9.VnS(), z3.VnS(), 0);
14926 __ Mov(z10, z2);
14927 __ Ftmad(z10.VnS(), z10.VnS(), z3.VnS(), 3);
14928 __ Mov(z11, z2);
14929 __ Ftmad(z11.VnS(), z11.VnS(), z3.VnS(), 4);
14930
14931 __ Mov(z12, z4);
14932 __ Ftmad(z12.VnD(), z12.VnD(), z5.VnD(), 0);
14933 __ Mov(z13, z4);
14934 __ Ftmad(z13.VnD(), z13.VnD(), z5.VnD(), 5);
14935 __ Mov(z14, z4);
14936 __ Ftmad(z14.VnD(), z14.VnD(), z5.VnD(), 7);
14937
14938 END();
14939
14940 if (CAN_RUN()) {
14941 RUN();
14942 uint64_t expected_z6[] = {0x7e027e02fe02fe01,
14943 0x4000400000000000,
14944 0x4000400000000000};
14945 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
14946 uint64_t expected_z7[] = {0x7e027e02fe02fe01,
14947 0x3aab3800bcabbe00,
14948 0x3aab3800bcabbe00};
14949 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
14950 uint64_t expected_z8[] = {0x7e027e02fe02fe01,
14951 0x3c083c2abbefbbac,
14952 0x3c083c2abbefbbac};
14953 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
14954 uint64_t expected_z9[] = {0x7fc00002ffc00001,
14955 0x4000000040000000,
14956 0x0000000000000000};
14957 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
14958 uint64_t expected_z10[] = {0x7fc00002ffc00001,
14959 0x3f7ff2ff3f7fa4fc,
14960 0xbf800680bf802d82};
14961 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
14962 uint64_t expected_z11[] = {0x7fc00002ffc00001,
14963 0x3f8000173f8000cd,
14964 0xbf7fffd2bf7ffe66};
14965 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
14966 uint64_t expected_z12[] = {0x7ff8000000000002,
14967 0x4000000000000000,
14968 0x0000000000000000};
14969 ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
14970 uint64_t expected_z13[] = {0x7ff8000000000002,
14971 0x3fefffff6c0d846c,
14972 0xbff0000006b978ae};
14973 ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
14974 uint64_t expected_z14[] = {0x7ff8000000000002,
14975 0x3feffffffffe708a,
14976 0xbff0000000000000};
14977 ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
14978 }
14979}
14980
Martyn Capewell37f28182020-01-14 10:15:10 +000014981static void BasicFPArithHelper(MacroAssembler* masm,
14982 int lane_size_in_bits,
14983 const uint64_t (&inputs)[2],
14984 const uint64_t (&inputs_fmulx)[2],
14985 const uint64_t (&inputs_nans)[2]) {
14986 int ls = lane_size_in_bits;
14987
14988 for (int i = 0; i < 16; i++) {
14989 InsrHelper(masm, z0.VnD(), inputs);
14990 }
14991 ZRegister rvrs = z1.WithLaneSize(ls);
14992 masm->Rev(rvrs, z0.WithLaneSize(ls));
14993
14994 int pred[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1};
14995 Initialise(masm, p2.VnB(), pred);
14996 PRegisterM p2m = p2.Merging();
14997
14998 masm->Mov(z2, z0);
14999 masm->Fadd(z2.WithLaneSize(ls),
15000 p2m,
15001 z2.WithLaneSize(ls),
15002 rvrs,
15003 FastNaNPropagation);
15004 masm->Mov(z3, z0);
15005 masm->Fsub(z3.WithLaneSize(ls), p2m, z3.WithLaneSize(ls), rvrs);
15006 masm->Mov(z4, z0);
15007 masm->Fsub(z4.WithLaneSize(ls), p2m, rvrs, z4.WithLaneSize(ls));
15008 masm->Mov(z5, z0);
15009 masm->Fabd(z5.WithLaneSize(ls),
15010 p2m,
15011 z5.WithLaneSize(ls),
15012 rvrs,
15013 FastNaNPropagation);
15014 masm->Mov(z6, z0);
15015 masm->Fmul(z6.WithLaneSize(ls),
15016 p2m,
15017 z6.WithLaneSize(ls),
15018 rvrs,
15019 FastNaNPropagation);
15020
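  // Fmulx differs from Fmul in that infinity multiplied by zero returns 2.0
  // (with the appropriate sign) rather than NaN; inputs_fmulx pairs infinities
  // with zeros to exercise exactly that case.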
15021 for (int i = 0; i < 16; i++) {
15022 InsrHelper(masm, z7.VnD(), inputs_fmulx);
15023 }
15024 masm->Rev(z8.WithLaneSize(ls), z7.WithLaneSize(ls));
15025 masm->Fmulx(z7.WithLaneSize(ls),
15026 p2m,
15027 z7.WithLaneSize(ls),
15028 z8.WithLaneSize(ls),
15029 FastNaNPropagation);
15030
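  // Fminnm and Fmaxnm return the numerical operand when exactly one operand is
  // a quiet NaN, so NaN lanes in the inputs take their results from `rvrs`.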
15031 InsrHelper(masm, z8.VnD(), inputs_nans);
15032 masm->Mov(z9, z8);
15033 masm->Fminnm(z9.WithLaneSize(ls),
15034 p2m,
15035 z9.WithLaneSize(ls),
15036 rvrs,
15037 FastNaNPropagation);
15038 masm->Mov(z10, z8);
15039 masm->Fmaxnm(z10.WithLaneSize(ls),
15040 p2m,
15041 z10.WithLaneSize(ls),
15042 rvrs,
15043 FastNaNPropagation);
15044}
15045
15046TEST_SVE(sve_fp_arith_pred_h) {
15047 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
15048 START();
15049
15050 uint64_t inputs[] = {0x4800470046004500, 0x4400420040003c00};
15051 uint64_t inputs_fmulx[] = {0x7c00fc007c00fc00, 0x0000800000008000};
15052 uint64_t inputs_nans[] = {0x7fffffff7fffffff, 0x7bfffbff7fbbfbff};
15053
15054 BasicFPArithHelper(&masm, kHRegSize, inputs, inputs_fmulx, inputs_nans);
15055
15056 END();
15057
15058 if (CAN_RUN()) {
15059 RUN();
15060 uint64_t expected_z2[] = {0x4880488048804880, 0x4880420048804880};
15061 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
15062 uint64_t expected_z3[] = {0x4700450042003c00, 0xbc004200c500c700};
15063 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
15064 uint64_t expected_z4[] = {0xc700c500c200bc00, 0x3c00420045004700};
15065 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
15066 uint64_t expected_z5[] = {0x4700450042003c00, 0x3c00420045004700};
15067 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
15068 uint64_t expected_z6[] = {0x48004b004c804d00, 0x4d0042004b004800};
15069 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
15070 uint64_t expected_z7[] = {0xc000c000c000c000, 0xc0008000c000c000};
15071 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
15072 uint64_t expected_z9[] = {0x3c00400042004400, 0x4500fbff4700fbff};
15073 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
15074 uint64_t expected_z10[] = {0x3c00400042004400, 0x7bfffbff47004800};
15075 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
15076 }
15077}
15078
15079TEST_SVE(sve_fp_arith_pred_s) {
15080 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
15081 START();
15082
15083 uint64_t inputs[] = {0x4080000040400000, 0x400000003f800000};
15084 uint64_t inputs_fmulx[] = {0x7f800000ff800000, 0x0000000080000000};
15085 uint64_t inputs_nans[] = {0x7fffffffffffffff, 0x41000000c1000000};
15086
15087 BasicFPArithHelper(&masm, kSRegSize, inputs, inputs_fmulx, inputs_nans);
15088
15089 END();
15090
15091 if (CAN_RUN()) {
15092 RUN();
15093 uint64_t expected_z2[] = {0x40a0000040a00000, 0x4000000040a00000};
15094 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
15095 uint64_t expected_z3[] = {0x404000003f800000, 0x40000000c0400000};
15096 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
15097 uint64_t expected_z4[] = {0xc0400000bf800000, 0x4000000040400000};
15098 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
15099 uint64_t expected_z5[] = {0x404000003f800000, 0x4000000040400000};
15100 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
15101 uint64_t expected_z6[] = {0x4080000040c00000, 0x4000000040800000};
15102 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
15103 uint64_t expected_z7[] = {0xc0000000c0000000, 0x00000000c0000000};
15104 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
15105 uint64_t expected_z9[] = {0x3f80000040000000, 0x41000000c1000000};
15106 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
15107 uint64_t expected_z10[] = {0x3f80000040000000, 0x4100000040800000};
15108 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
15109 }
15110}
15111
15112TEST_SVE(sve_fp_arith_pred_d) {
15113 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
15114 START();
15115
15116 uint64_t inputs[] = {0x4000000000000000, 0x3ff0000000000000};
15117 uint64_t inputs_fmulx[] = {0x7ff0000000000000, 0x8000000000000000};
15118 uint64_t inputs_nans[] = {0x7fffffffffffffff, 0x4100000000000000};
15119
15120 BasicFPArithHelper(&masm, kDRegSize, inputs, inputs_fmulx, inputs_nans);
15121
15122 END();
15123
15124 if (CAN_RUN()) {
15125 RUN();
15126 uint64_t expected_z2[] = {0x4008000000000000, 0x4008000000000000};
15127 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
15128 uint64_t expected_z3[] = {0x3ff0000000000000, 0xbff0000000000000};
15129 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
15130 uint64_t expected_z4[] = {0xbff0000000000000, 0x3ff0000000000000};
15131 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
15132 uint64_t expected_z5[] = {0x3ff0000000000000, 0x3ff0000000000000};
15133 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
15134 uint64_t expected_z6[] = {0x4000000000000000, 0x4000000000000000};
15135 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
15136 uint64_t expected_z7[] = {0xc000000000000000, 0xc000000000000000};
15137 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
15138 uint64_t expected_z9[] = {0x3ff0000000000000, 0x4000000000000000};
15139 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
15140 uint64_t expected_z10[] = {0x3ff0000000000000, 0x4100000000000000};
15141 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
15142 }
15143}
15144
Martyn Capewella2fadc22020-01-16 16:09:55 +000015145TEST_SVE(sve_fp_arith_pred_imm) {
15146 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
15147 START();
15148
15149 int pred[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1};
15150 Initialise(&masm, p0.VnB(), pred);
15151 PRegisterM p0m = p0.Merging();
15152 __ Ptrue(p1.VnB());
15153
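  // The immediate forms used below accept only a small set of encodable
  // constants (0.5, 1.0 and 2.0 for Fadd/Fsub/Fmul, and 0.0 and 1.0 for the
  // min/max forms). Dividing 0.0 by 0.0 generates the NaN lanes used to check
  // Fminnm and Fmaxnm.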
15154 __ Fdup(z0.VnD(), 0.0);
15155
15156 __ Mov(z1, z0);
15157 __ Fdiv(z1.VnH(), p1.Merging(), z1.VnH(), z1.VnH());
15158 __ Mov(z2, z0);
15159 __ Fadd(z2.VnH(), p0m, z2.VnH(), 0.5);
15160 __ Mov(z3, z2);
15161 __ Fsub(z3.VnH(), p0m, z3.VnH(), 1.0);
15162 __ Mov(z4, z3);
15163 __ Fsub(z4.VnH(), p0m, 1.0, z4.VnH());
15164 __ Mov(z5, z4);
15165 __ Fmul(z5.VnH(), p0m, z5.VnH(), 2.0);
15166 __ Mov(z6, z1);
15167 __ Fminnm(z6.VnH(), p0m, z6.VnH(), 0.0);
15168 __ Mov(z7, z1);
15169 __ Fmaxnm(z7.VnH(), p0m, z7.VnH(), 1.0);
15170 __ Mov(z8, z5);
15171 __ Fmin(z8.VnH(), p0m, z8.VnH(), 1.0);
15172 __ Mov(z9, z5);
15173 __ Fmax(z9.VnH(), p0m, z9.VnH(), 0.0);
15174
15175 __ Mov(z11, z0);
15176 __ Fdiv(z11.VnS(), p1.Merging(), z11.VnS(), z11.VnS());
15177 __ Mov(z12, z0);
15178 __ Fadd(z12.VnS(), p0m, z12.VnS(), 0.5);
15179 __ Mov(z13, z12);
15180 __ Fsub(z13.VnS(), p0m, z13.VnS(), 1.0);
15181 __ Mov(z14, z13);
15182 __ Fsub(z14.VnS(), p0m, 1.0, z14.VnS());
15183 __ Mov(z15, z14);
15184 __ Fmul(z15.VnS(), p0m, z15.VnS(), 2.0);
15185 __ Mov(z16, z11);
15186 __ Fminnm(z16.VnS(), p0m, z16.VnS(), 0.0);
15187 __ Mov(z17, z11);
15188 __ Fmaxnm(z17.VnS(), p0m, z17.VnS(), 1.0);
15189 __ Mov(z18, z15);
15190 __ Fmin(z18.VnS(), p0m, z18.VnS(), 1.0);
15191 __ Mov(z19, z15);
15192 __ Fmax(z19.VnS(), p0m, z19.VnS(), 0.0);
15193
15194 __ Mov(z21, z0);
15195 __ Fdiv(z21.VnD(), p1.Merging(), z21.VnD(), z21.VnD());
15196 __ Mov(z22, z0);
15197 __ Fadd(z22.VnD(), p0m, z22.VnD(), 0.5);
15198 __ Mov(z23, z22);
15199 __ Fsub(z23.VnD(), p0m, z23.VnD(), 1.0);
15200 __ Mov(z24, z23);
15201 __ Fsub(z24.VnD(), p0m, 1.0, z24.VnD());
15202 __ Mov(z25, z24);
15203 __ Fmul(z25.VnD(), p0m, z25.VnD(), 2.0);
15204 __ Mov(z26, z21);
15205 __ Fminnm(z26.VnD(), p0m, z26.VnD(), 0.0);
15206 __ Mov(z27, z21);
15207 __ Fmaxnm(z27.VnD(), p0m, z27.VnD(), 1.0);
15208 __ Mov(z28, z25);
15209 __ Fmin(z28.VnD(), p0m, z28.VnD(), 1.0);
15210 __ Mov(z29, z25);
15211 __ Fmax(z29.VnD(), p0m, z29.VnD(), 0.0);
15212
15213 __ Index(z0.VnH(), -3, 1);
15214 __ Scvtf(z0.VnH(), p1.Merging(), z0.VnH());
15215 __ Fmax(z0.VnH(), p1.Merging(), z0.VnH(), 0.0);
15216 __ Index(z1.VnS(), -4, 2);
15217 __ Scvtf(z1.VnS(), p1.Merging(), z1.VnS());
15218 __ Fadd(z1.VnS(), p1.Merging(), z1.VnS(), 1.0);
15219
15220 END();
15221
15222 if (CAN_RUN()) {
15223 RUN();
15224 uint64_t expected_z2[] = {0x3800380038003800, 0x3800000038003800};
15225 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
15226 uint64_t expected_z3[] = {0xb800b800b800b800, 0xb8000000b800b800};
15227 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
15228 uint64_t expected_z4[] = {0x3e003e003e003e00, 0x3e0000003e003e00};
15229 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
15230 uint64_t expected_z5[] = {0x4200420042004200, 0x4200000042004200};
15231 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
15232 uint64_t expected_z6[] = {0x0000000000000000, 0x00007e0000000000};
15233 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
15234 uint64_t expected_z7[] = {0x3c003c003c003c00, 0x3c007e003c003c00};
15235 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
15236 uint64_t expected_z8[] = {0x3c003c003c003c00, 0x3c0000003c003c00};
15237 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
15238 uint64_t expected_z9[] = {0x4200420042004200, 0x4200000042004200};
15239 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
15240
15241 uint64_t expected_z12[] = {0x3f0000003f000000, 0x000000003f000000};
15242 ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
15243 uint64_t expected_z13[] = {0xbf000000bf000000, 0x00000000bf000000};
15244 ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
15245 uint64_t expected_z14[] = {0x3fc000003fc00000, 0x000000003fc00000};
15246 ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
15247 uint64_t expected_z15[] = {0x4040000040400000, 0x0000000040400000};
15248 ASSERT_EQUAL_SVE(expected_z15, z15.VnD());
15249 uint64_t expected_z16[] = {0x0000000000000000, 0x7fc0000000000000};
15250 ASSERT_EQUAL_SVE(expected_z16, z16.VnD());
15251 uint64_t expected_z17[] = {0x3f8000003f800000, 0x7fc000003f800000};
15252 ASSERT_EQUAL_SVE(expected_z17, z17.VnD());
15253 uint64_t expected_z18[] = {0x3f8000003f800000, 0x000000003f800000};
15254 ASSERT_EQUAL_SVE(expected_z18, z18.VnD());
15255 uint64_t expected_z19[] = {0x4040000040400000, 0x0000000040400000};
15256 ASSERT_EQUAL_SVE(expected_z19, z19.VnD());
15257
15258 uint64_t expected_z22[] = {0x3fe0000000000000, 0x3fe0000000000000};
15259 ASSERT_EQUAL_SVE(expected_z22, z22.VnD());
15260 uint64_t expected_z23[] = {0xbfe0000000000000, 0xbfe0000000000000};
15261 ASSERT_EQUAL_SVE(expected_z23, z23.VnD());
15262 uint64_t expected_z24[] = {0x3ff8000000000000, 0x3ff8000000000000};
15263 ASSERT_EQUAL_SVE(expected_z24, z24.VnD());
15264 uint64_t expected_z25[] = {0x4008000000000000, 0x4008000000000000};
15265 ASSERT_EQUAL_SVE(expected_z25, z25.VnD());
15266 uint64_t expected_z26[] = {0x0000000000000000, 0x0000000000000000};
15267 ASSERT_EQUAL_SVE(expected_z26, z26.VnD());
15268 uint64_t expected_z27[] = {0x3ff0000000000000, 0x3ff0000000000000};
15269 ASSERT_EQUAL_SVE(expected_z27, z27.VnD());
15270 uint64_t expected_z28[] = {0x3ff0000000000000, 0x3ff0000000000000};
15271 ASSERT_EQUAL_SVE(expected_z28, z28.VnD());
15272 uint64_t expected_z29[] = {0x4008000000000000, 0x4008000000000000};
15273 ASSERT_EQUAL_SVE(expected_z29, z29.VnD());
15274 uint64_t expected_z0[] = {0x4400420040003c00, 0x0000000000000000};
15275 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
15276 uint64_t expected_z1[] = {0x404000003f800000, 0xbf800000c0400000};
15277 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
15278 }
15279}
15280
Martyn Capewell37f28182020-01-14 10:15:10 +000015281TEST_SVE(sve_fscale) {
15282 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
15283 START();
15284
15285 uint64_t inputs_h[] = {0x4800470046004500, 0x4400420040003c00};
15286 InsrHelper(&masm, z0.VnD(), inputs_h);
15287 uint64_t inputs_s[] = {0x4080000040400000, 0x400000003f800000};
15288 InsrHelper(&masm, z1.VnD(), inputs_s);
15289 uint64_t inputs_d[] = {0x40f0000000000000, 0x4000000000000000};
15290 InsrHelper(&masm, z2.VnD(), inputs_d);
15291
15292 uint64_t scales[] = {0x00080002fff8fffe, 0x00100001fff0ffff};
15293 InsrHelper(&masm, z3.VnD(), scales);
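  // Fscale computes zd = zn * 2^zm, with zm interpreted as a signed integer,
  // so the scale values above hold small positive and negative exponents.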
15294
15295 __ Ptrue(p0.VnB());
15296 int pred[] = {0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1};
15297 Initialise(&masm, p1.VnB(), pred);
15298
15299 __ Mov(z4, z0);
15300 __ Fscale(z4.VnH(), p0.Merging(), z4.VnH(), z3.VnH());
15301 __ Mov(z5, z0);
15302 __ Fscale(z5.VnH(), p1.Merging(), z5.VnH(), z3.VnH());
15303
15304 __ Sunpklo(z3.VnS(), z3.VnH());
15305 __ Mov(z6, z1);
15306 __ Fscale(z6.VnS(), p0.Merging(), z6.VnS(), z3.VnS());
15307 __ Mov(z7, z1);
15308 __ Fscale(z7.VnS(), p1.Merging(), z7.VnS(), z3.VnS());
15309
15310 __ Sunpklo(z3.VnD(), z3.VnS());
15311 __ Mov(z8, z2);
15312 __ Fscale(z8.VnD(), p0.Merging(), z8.VnD(), z3.VnD());
15313 __ Mov(z9, z2);
15314 __ Fscale(z9.VnD(), p1.Merging(), z9.VnD(), z3.VnD());
15315
15316 // Test full double precision range scaling.
15317 __ Dup(z10.VnD(), 2045);
15318 __ Dup(z11.VnD(), 0x0010000000000000); // 2^-1022
15319 __ Fscale(z11.VnD(), p0.Merging(), z11.VnD(), z10.VnD());
15320
15321 END();
15322
15323 if (CAN_RUN()) {
15324 RUN();
15325
15326 uint64_t expected_z4[] = {0x68004f0026003d00, 0x7c00460002003800};
15327 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
15328 uint64_t expected_z5[] = {0x68004f0026004500, 0x7c00420002003800};
15329 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
15330
15331 uint64_t expected_z6[] = {0x4880000040c00000, 0x380000003f000000};
15332 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
15333 uint64_t expected_z7[] = {0x4880000040400000, 0x400000003f000000};
15334 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
15335
15336 uint64_t expected_z8[] = {0x3ff0000000000000, 0x3ff0000000000000};
15337 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
15338 uint64_t expected_z9[] = {0x40f0000000000000, 0x3ff0000000000000};
15339 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
15340
15341 uint64_t expected_z11[] = {0x7fe0000000000000, 0x7fe0000000000000};
15342 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
15343 }
15344}
15345
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015346typedef void (MacroAssembler::*FcvtFrintMFn)(const ZRegister& zd,
15347 const PRegisterM& pg,
15348 const ZRegister& zn);
15349
15350typedef void (MacroAssembler::*FcvtFrintZFn)(const ZRegister& zd,
15351 const PRegisterZ& pg,
15352 const ZRegister& zn);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015353
15354template <typename F, size_t N>
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015355static void TestFcvtFrintHelper(Test* config,
15356 FcvtFrintMFn macro_m,
15357 FcvtFrintZFn macro_z,
15358 int dst_type_size_in_bits,
15359 int src_type_size_in_bits,
15360 const F (&zn_inputs)[N],
15361 const int (&pg_inputs)[N],
15362 const uint64_t (&zd_expected_all_active)[N]) {
15363 VIXL_ASSERT(macro_m != NULL);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015364 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
15365 START();
15366
15367  // If the input and result types have different sizes, the lane size used
15368  // for the operation (and for its governing predicate) is that of the
15369  // larger type.
15370 int lane_size_in_bits =
15371 std::max(dst_type_size_in_bits, src_type_size_in_bits);
15372
15373 ZRegister zd_all_active = z25;
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015374 ZRegister zd_merging = z26;
TatWai Chongdb7437c2020-01-09 17:44:10 -080015375 ZRegister zn = z27;
15376
15377 uint64_t zn_rawbits[N];
15378 FPToRawbitsWithSize(zn_inputs, zn_rawbits, src_type_size_in_bits);
15379 InsrHelper(&masm, zn.WithLaneSize(lane_size_in_bits), zn_rawbits);
15380
15381 PRegisterWithLaneSize pg_all_active = p0.WithLaneSize(lane_size_in_bits);
15382 __ Ptrue(pg_all_active);
15383
Josh Sorefb43d6ef2022-08-03 12:47:14 -040015384 // Test floating-point conversions with all lanes activated.
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015385 (masm.*macro_m)(zd_all_active.WithLaneSize(dst_type_size_in_bits),
15386 pg_all_active.Merging(),
15387 zn.WithLaneSize(src_type_size_in_bits));
TatWai Chongdb7437c2020-01-09 17:44:10 -080015388
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015389 PRegisterWithLaneSize pg_merging = p1.WithLaneSize(lane_size_in_bits);
15390 Initialise(&masm, pg_merging, pg_inputs);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015391
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015392 __ Dup(zd_merging.VnD(), 0x0bad0bad0bad0bad);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015393
15394  // Use the same `zn` inputs to test floating-point conversions, but with
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015395  // some lanes set inactive.
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015396 (masm.*macro_m)(zd_merging.WithLaneSize(dst_type_size_in_bits),
15397 pg_merging.Merging(),
15398 zn.WithLaneSize(src_type_size_in_bits));
15399
15400 ZRegister zd_zeroing = z24;
15401 PRegisterWithLaneSize pg_zeroing = p1.WithLaneSize(lane_size_in_bits);
15402 Initialise(&masm, pg_zeroing, pg_inputs);
15403
15404 if (macro_z != NULL) {
15405 __ Dup(zd_zeroing.VnD(), 0x0bad0bad0bad0bad);
15406 (masm.*macro_z)(zd_zeroing.WithLaneSize(dst_type_size_in_bits),
15407 pg_zeroing.Zeroing(),
15408 zn.WithLaneSize(src_type_size_in_bits));
15409 }
TatWai Chongdb7437c2020-01-09 17:44:10 -080015410
15411 END();
15412
15413 if (CAN_RUN()) {
15414 RUN();
15415
15416 ASSERT_EQUAL_SVE(zd_expected_all_active,
15417 zd_all_active.WithLaneSize(lane_size_in_bits));
15418
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015419 uint64_t zd_expected_merging[N];
TatWai Chongdb7437c2020-01-09 17:44:10 -080015420 for (unsigned i = 0; i < N; i++) {
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015421 zd_expected_merging[i] =
TatWai Chongdb7437c2020-01-09 17:44:10 -080015422 pg_inputs[i] ? zd_expected_all_active[i]
15423 : 0x0bad0bad0bad0bad & GetUintMask(lane_size_in_bits);
15424 }
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015425 ASSERT_EQUAL_SVE(zd_expected_merging,
15426 zd_merging.WithLaneSize(lane_size_in_bits));
15427
15428 if (macro_z != NULL) {
15429 uint64_t zd_expected_zeroing[N] = {0};
15430 for (unsigned i = 0; i < N; i++) {
15431 if (pg_inputs[i]) {
15432 zd_expected_zeroing[i] = zd_expected_all_active[i];
15433 }
15434 }
15435 ASSERT_EQUAL_SVE(zd_expected_zeroing,
15436 zd_zeroing.WithLaneSize(lane_size_in_bits));
15437 }
TatWai Chongdb7437c2020-01-09 17:44:10 -080015438 }
15439}
15440
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015441template <typename F, size_t N>
15442static void TestFcvtzHelper(Test* config,
15443 FcvtFrintMFn macro_m,
15444 int dst_type_size_in_bits,
15445 int src_type_size_in_bits,
15446 const F (&zn_inputs)[N],
15447 const int (&pg_inputs)[N],
15448 const uint64_t (&zd_expected_all_active)[N]) {
15449 TestFcvtFrintHelper(config,
15450 macro_m,
15451 // Fcvt variants have no zeroing predication form.
15452 NULL,
15453 dst_type_size_in_bits,
15454 src_type_size_in_bits,
15455 zn_inputs,
15456 pg_inputs,
15457 zd_expected_all_active);
15458}
15459
TatWai Chongdb7437c2020-01-09 17:44:10 -080015460TEST_SVE(fcvtzs_fcvtzu_float16) {
Jacob Bramleyf73036b2020-11-04 09:06:03 +000015461  const double h_max_float16 = 0x7ff0;  // Largest float16 < INT16_MAX.
TatWai Chongdb7437c2020-01-09 17:44:10 -080015462 const double h_min_float16 = -h_max_float16; // Smallest float16 > INT16_MIN.
15463 const double largest_float16 = 0xffe0; // 65504
15464 const double smallest_float16 = -largest_float16;
Jacob Bramleyf73036b2020-11-04 09:06:03 +000015465 const double h_max_int_add_one = 0x8000;
TatWai Chongdb7437c2020-01-09 17:44:10 -080015466
15467 double zn_inputs[] = {1.0,
15468 1.1,
15469 1.5,
15470 -1.5,
15471 h_max_float16,
15472 h_min_float16,
15473 largest_float16,
15474 smallest_float16,
15475 kFP64PositiveInfinity,
15476 kFP64NegativeInfinity,
Jacob Bramleyf73036b2020-11-04 09:06:03 +000015477 h_max_int_add_one};
TatWai Chongdb7437c2020-01-09 17:44:10 -080015478
Jacob Bramleyf73036b2020-11-04 09:06:03 +000015479 int pg_inputs[] = {0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1};
TatWai Chongdb7437c2020-01-09 17:44:10 -080015480
Jacob Bramleyf73036b2020-11-04 09:06:03 +000015481 uint64_t expected_fcvtzs_fp162h[] =
15482 {1, 1, 1, 0xffff, 0x7ff0, 0x8010, 0x7fff, 0x8000, 0x7fff, 0x8000, 0x7fff};
TatWai Chongdb7437c2020-01-09 17:44:10 -080015483
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015484 uint64_t expected_fcvtzu_fp162h[] =
Jacob Bramleyf73036b2020-11-04 09:06:03 +000015485 {1, 1, 1, 0, 0x7ff0, 0, 0xffe0, 0, 0xffff, 0, 0x8000};
TatWai Chongdb7437c2020-01-09 17:44:10 -080015486
15487 // Float16 to 16-bit integers.
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015488 TestFcvtzHelper(config,
15489 &MacroAssembler::Fcvtzs,
15490 kHRegSize,
15491 kHRegSize,
15492 zn_inputs,
15493 pg_inputs,
15494 expected_fcvtzs_fp162h);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015495
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015496 TestFcvtzHelper(config,
15497 &MacroAssembler::Fcvtzu,
15498 kHRegSize,
15499 kHRegSize,
15500 zn_inputs,
15501 pg_inputs,
15502 expected_fcvtzu_fp162h);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015503
15504 uint64_t expected_fcvtzs_fp162w[] = {1,
15505 1,
15506 1,
15507 0xffffffff,
Jacob Bramleyf73036b2020-11-04 09:06:03 +000015508 0x7ff0,
15509 0xffff8010,
TatWai Chongdb7437c2020-01-09 17:44:10 -080015510 0xffe0,
15511 0xffff0020,
15512 0x7fffffff,
15513 0x80000000,
Jacob Bramleyf73036b2020-11-04 09:06:03 +000015514 0x8000};
TatWai Chongdb7437c2020-01-09 17:44:10 -080015515
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015516 uint64_t expected_fcvtzu_fp162w[] =
Jacob Bramleyf73036b2020-11-04 09:06:03 +000015517 {1, 1, 1, 0, 0x7ff0, 0, 0xffe0, 0, 0xffffffff, 0, 0x8000};
TatWai Chongdb7437c2020-01-09 17:44:10 -080015518
15519 // Float16 to 32-bit integers.
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015520 TestFcvtzHelper(config,
15521 &MacroAssembler::Fcvtzs,
15522 kSRegSize,
15523 kHRegSize,
15524 zn_inputs,
15525 pg_inputs,
15526 expected_fcvtzs_fp162w);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015527
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015528 TestFcvtzHelper(config,
15529 &MacroAssembler::Fcvtzu,
15530 kSRegSize,
15531 kHRegSize,
15532 zn_inputs,
15533 pg_inputs,
15534 expected_fcvtzu_fp162w);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015535
15536 uint64_t expected_fcvtzs_fp162x[] = {1,
15537 1,
15538 1,
15539 0xffffffffffffffff,
Jacob Bramleyf73036b2020-11-04 09:06:03 +000015540 0x7ff0,
15541 0xffffffffffff8010,
TatWai Chongdb7437c2020-01-09 17:44:10 -080015542 0xffe0,
15543 0xffffffffffff0020,
15544 0x7fffffffffffffff,
15545 0x8000000000000000,
Jacob Bramleyf73036b2020-11-04 09:06:03 +000015546 0x8000};
TatWai Chongdb7437c2020-01-09 17:44:10 -080015547
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015548 uint64_t expected_fcvtzu_fp162x[] =
Jacob Bramleyf73036b2020-11-04 09:06:03 +000015549 {1, 1, 1, 0, 0x7ff0, 0, 0xffe0, 0, 0xffffffffffffffff, 0, 0x8000};
TatWai Chongdb7437c2020-01-09 17:44:10 -080015550
15551 // Float16 to 64-bit integers.
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015552 TestFcvtzHelper(config,
15553 &MacroAssembler::Fcvtzs,
15554 kDRegSize,
15555 kHRegSize,
15556 zn_inputs,
15557 pg_inputs,
15558 expected_fcvtzs_fp162x);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015559
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015560 TestFcvtzHelper(config,
15561 &MacroAssembler::Fcvtzu,
15562 kDRegSize,
15563 kHRegSize,
15564 zn_inputs,
15565 pg_inputs,
15566 expected_fcvtzu_fp162x);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015567}
15568
15569TEST_SVE(fcvtzs_fcvtzu_float) {
15570 const double w_max_float = 0x7fffff80; // Largest float < INT32_MAX.
15571 const double w_min_float = -w_max_float; // Smallest float > INT32_MIN.
15572 const double x_max_float = 0x7fffff8000000000; // Largest float < INT64_MAX.
15573 const double x_min_float = -x_max_float; // Smallest float > INT64_MIN.
Jacob Bramleyf73036b2020-11-04 09:06:03 +000015574 const double w_min_int_add_one = 0x80000000;
15575 const double x_max_int_add_one = 0x80000000'00000000;
TatWai Chongdb7437c2020-01-09 17:44:10 -080015576
TatWai Chongdb7437c2020-01-09 17:44:10 -080015577 double zn_inputs[] = {1.0,
15578 1.1,
15579 1.5,
15580 -1.5,
15581 w_max_float,
15582 w_min_float,
15583 x_max_float,
15584 x_min_float,
15585 kFP64PositiveInfinity,
15586 kFP64NegativeInfinity,
TatWai Chongdb7437c2020-01-09 17:44:10 -080015587 w_min_int_add_one,
Jacob Bramleyf73036b2020-11-04 09:06:03 +000015588 x_max_int_add_one};
TatWai Chongdb7437c2020-01-09 17:44:10 -080015589
Jacob Bramleyf73036b2020-11-04 09:06:03 +000015590 int pg_inputs[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1};
TatWai Chongdb7437c2020-01-09 17:44:10 -080015591
15592 uint64_t expected_fcvtzs_s2w[] = {1,
15593 1,
15594 1,
15595 0xffffffff,
15596 0x7fffff80,
15597 0x80000080,
15598 0x7fffffff,
15599 0x80000000,
15600 0x7fffffff,
15601 0x80000000,
15602 0x7fffffff,
Jacob Bramleyf73036b2020-11-04 09:06:03 +000015603 0x7fffffff};
TatWai Chongdb7437c2020-01-09 17:44:10 -080015604
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015605 uint64_t expected_fcvtzu_s2w[] = {1,
15606 1,
15607 1,
15608 0,
15609 0x7fffff80,
15610 0,
15611 0xffffffff,
15612 0,
15613 0xffffffff,
15614 0,
15615 0x80000000,
Jacob Bramleyf73036b2020-11-04 09:06:03 +000015616 0xffffffff};
TatWai Chongdb7437c2020-01-09 17:44:10 -080015617
15618 // Float to 32-bit integers.
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015619 TestFcvtzHelper(config,
15620 &MacroAssembler::Fcvtzs,
15621 kSRegSize,
15622 kSRegSize,
15623 zn_inputs,
15624 pg_inputs,
15625 expected_fcvtzs_s2w);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015626
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015627 TestFcvtzHelper(config,
15628 &MacroAssembler::Fcvtzu,
15629 kSRegSize,
15630 kSRegSize,
15631 zn_inputs,
15632 pg_inputs,
15633 expected_fcvtzu_s2w);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015634
15635 uint64_t expected_fcvtzs_s2x[] = {1,
15636 1,
15637 1,
15638 0xffffffffffffffff,
15639 0x7fffff80,
15640 0xffffffff80000080,
15641 0x7fffff8000000000,
15642 0x8000008000000000,
15643 0x7fffffffffffffff,
15644 0x8000000000000000,
15645 0x80000000,
Jacob Bramleyf73036b2020-11-04 09:06:03 +000015646 0x7fffffffffffffff};
TatWai Chongdb7437c2020-01-09 17:44:10 -080015647
15648 uint64_t expected_fcvtzu_s2x[] = {1,
15649 1,
15650 1,
15651 0,
15652 0x7fffff80,
15653 0,
15654 0x7fffff8000000000,
15655 0,
15656 0xffffffffffffffff,
15657 0,
Jacob Bramleyf73036b2020-11-04 09:06:03 +000015658 0x80000000,
15659 0x8000000000000000};
TatWai Chongdb7437c2020-01-09 17:44:10 -080015660
15661 // Float to 64-bit integers.
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015662 TestFcvtzHelper(config,
15663 &MacroAssembler::Fcvtzs,
15664 kDRegSize,
15665 kSRegSize,
15666 zn_inputs,
15667 pg_inputs,
15668 expected_fcvtzs_s2x);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015669
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015670 TestFcvtzHelper(config,
15671 &MacroAssembler::Fcvtzu,
15672 kDRegSize,
15673 kSRegSize,
15674 zn_inputs,
15675 pg_inputs,
15676 expected_fcvtzu_s2x);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015677}
15678
15679TEST_SVE(fcvtzs_fcvtzu_double) {
TatWai Chongdb7437c2020-01-09 17:44:10 -080015680 const double w_max_float = 0x7fffff80; // Largest float < INT32_MAX.
15681 const double w_min_float = -w_max_float; // Smallest float > INT32_MIN.
15682 const double x_max_float = 0x7fffff8000000000; // Largest float < INT64_MAX.
15683 const double x_min_float = -x_max_float; // Smallest float > INT64_MIN.
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015684 const double w_max_double = kWMaxInt; // Largest double == INT32_MAX.
15685 const double w_min_double = -w_max_double; // Smallest double > INT32_MIN.
15686 const double x_max_double =
15687 0x7ffffffffffffc00; // Largest double < INT64_MAX.
15688 const double x_min_double = -x_max_double; // Smallest double > INT64_MIN.
TatWai Chongdb7437c2020-01-09 17:44:10 -080015689 const double w_max_int_sub_one = kWMaxInt - 1;
15690 const double w_min_int_add_one = kWMinInt + 1;
Jacob Bramleyf73036b2020-11-04 09:06:03 +000015691 const double w_max_int_add_one = 0x80000000;
15692 const double x_max_int_add_one = 0x80000000'00000000;
TatWai Chongdb7437c2020-01-09 17:44:10 -080015693
15694 double zn_inputs[] = {1.0,
15695 1.1,
15696 1.5,
15697 -1.5,
15698 w_max_float,
15699 w_min_float,
15700 x_max_float,
15701 x_min_float,
15702 w_max_double,
15703 w_min_double,
15704 x_max_double,
15705 x_min_double,
15706 kFP64PositiveInfinity,
15707 kFP64NegativeInfinity,
15708 w_max_int_sub_one,
15709 w_min_int_add_one,
Jacob Bramleyf73036b2020-11-04 09:06:03 +000015710 w_max_int_add_one,
15711 x_max_int_add_one};
TatWai Chongdb7437c2020-01-09 17:44:10 -080015712
15713 int pg_inputs[] = {1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0};
15714
15715 uint64_t expected_fcvtzs_d2w[] = {1,
15716 1,
15717 1,
15718 0xffffffffffffffff,
15719 0x7fffff80,
15720 0xffffffff80000080,
15721 0x7fffffff,
15722 0xffffffff80000000,
15723 0x7fffffff,
15724 0xffffffff80000001,
15725 0x7fffffff,
15726 0xffffffff80000000,
15727 0x7fffffff,
15728 0xffffffff80000000,
15729 0x7ffffffe,
15730 0xffffffff80000001,
15731 0x7fffffff,
Jacob Bramleyf73036b2020-11-04 09:06:03 +000015732 0x7fffffff};
TatWai Chongdb7437c2020-01-09 17:44:10 -080015733
15734 uint64_t expected_fcvtzu_d2w[] = {1,
15735 1,
15736 1,
15737 0,
15738 0x7fffff80,
15739 0,
15740 0xffffffff,
15741 0,
15742 0x7fffffff,
15743 0,
15744 0xffffffff,
15745 0,
15746 0xffffffff,
15747 0,
15748 0x7ffffffe,
15749 0,
Jacob Bramleyf73036b2020-11-04 09:06:03 +000015750 0x80000000,
15751 0xffffffff};
TatWai Chongdb7437c2020-01-09 17:44:10 -080015752
15753 // Double to 32-bit integers.
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015754 TestFcvtzHelper(config,
15755 &MacroAssembler::Fcvtzs,
15756 kSRegSize,
15757 kDRegSize,
15758 zn_inputs,
15759 pg_inputs,
15760 expected_fcvtzs_d2w);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015761
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015762 TestFcvtzHelper(config,
15763 &MacroAssembler::Fcvtzu,
15764 kSRegSize,
15765 kDRegSize,
15766 zn_inputs,
15767 pg_inputs,
15768 expected_fcvtzu_d2w);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015769
15770 uint64_t expected_fcvtzs_d2x[] = {1,
15771 1,
15772 1,
15773 0xffffffffffffffff,
15774 0x7fffff80,
15775 0xffffffff80000080,
15776 0x7fffff8000000000,
15777 0x8000008000000000,
15778 0x7fffffff,
15779 0xffffffff80000001,
15780 0x7ffffffffffffc00,
15781 0x8000000000000400,
15782 0x7fffffffffffffff,
15783 0x8000000000000000,
15784 0x7ffffffe,
15785 0xffffffff80000001,
Jacob Bramleyf73036b2020-11-04 09:06:03 +000015786 0x80000000,
15787 0x7fffffffffffffff};
TatWai Chongdb7437c2020-01-09 17:44:10 -080015788
15789 uint64_t expected_fcvtzu_d2x[] = {1,
15790 1,
15791 1,
15792 0,
15793 0x7fffff80,
15794 0,
15795 0x7fffff8000000000,
15796 0,
15797 0x7fffffff,
15798 0,
15799 0x7ffffffffffffc00,
15800 0,
15801 0xffffffffffffffff,
15802 0,
15803 0x000000007ffffffe,
15804 0,
Jacob Bramleyf73036b2020-11-04 09:06:03 +000015805 0x80000000,
15806 0x8000000000000000};
TatWai Chongdb7437c2020-01-09 17:44:10 -080015807
15808 // Double to 64-bit integers.
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015809 TestFcvtzHelper(config,
15810 &MacroAssembler::Fcvtzs,
15811 kDRegSize,
15812 kDRegSize,
15813 zn_inputs,
15814 pg_inputs,
15815 expected_fcvtzs_d2x);
TatWai Chongdb7437c2020-01-09 17:44:10 -080015816
TatWai Chongf07b8ce2020-02-17 00:05:54 -080015817 TestFcvtzHelper(config,
15818 &MacroAssembler::Fcvtzu,
15819 kDRegSize,
15820 kDRegSize,
15821 zn_inputs,
15822 pg_inputs,
15823 expected_fcvtzu_d2x);
15824}
15825
15826template <typename F, size_t N>
15827static void TestFrintHelper(Test* config,
15828 FcvtFrintMFn macro_m,
15829 FcvtFrintZFn macro_z,
15830 int lane_size_in_bits,
15831 const F (&zn_inputs)[N],
15832 const int (&pg_inputs)[N],
15833 const F (&zd_expected)[N]) {
15834 uint64_t zd_expected_rawbits[N];
15835 FPToRawbitsWithSize(zd_expected, zd_expected_rawbits, lane_size_in_bits);
15836 TestFcvtFrintHelper(config,
15837 macro_m,
15838 macro_z,
15839 lane_size_in_bits,
15840 lane_size_in_bits,
15841 zn_inputs,
15842 pg_inputs,
15843 zd_expected_rawbits);
15844}
15845
15846TEST_SVE(frint) {
15847 const double inf_pos = kFP64PositiveInfinity;
15848 const double inf_neg = kFP64NegativeInfinity;
15849
15850 double zn_inputs[] =
15851 {1.1, 1.5, 1.9, 2.5, -1.5, -2.5, 0.0, -0.0, -0.2, inf_pos, inf_neg};
15852 double zd_expected_a[] =
15853 {1.0, 2.0, 2.0, 3.0, -2.0, -3.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
15854 double zd_expected_i[] =
15855 {1.0, 2.0, 2.0, 2.0, -2.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
15856 double zd_expected_m[] =
15857 {1.0, 1.0, 1.0, 2.0, -2.0, -3.0, 0.0, -0.0, -1.0, inf_pos, inf_neg};
15858 double zd_expected_n[] =
15859 {1.0, 2.0, 2.0, 2.0, -2.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
15860 double zd_expected_p[] =
15861 {2.0, 2.0, 2.0, 3.0, -1.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
15862 double zd_expected_x[] =
15863 {1.0, 2.0, 2.0, 2.0, -2.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
15864 double zd_expected_z[] =
15865 {1.0, 1.0, 1.0, 2.0, -1.0, -2.0, 0.0, -0.0, -0.0, inf_pos, inf_neg};
15866
15867 int pg_inputs[] = {0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0};
15868
15869 struct TestDataSet {
15870 FcvtFrintMFn macro_m; // merging form.
15871 FcvtFrintZFn macro_z; // zeroing form.
15872 double (&expected)[11];
15873 };
15874
15875 TestDataSet test_data[] =
15876 {{&MacroAssembler::Frinta, &MacroAssembler::Frinta, zd_expected_a},
15877 {&MacroAssembler::Frinti, &MacroAssembler::Frinti, zd_expected_i},
15878 {&MacroAssembler::Frintm, &MacroAssembler::Frintm, zd_expected_m},
15879 {&MacroAssembler::Frintn, &MacroAssembler::Frintn, zd_expected_n},
15880 {&MacroAssembler::Frintp, &MacroAssembler::Frintp, zd_expected_p},
15881 {&MacroAssembler::Frintx, &MacroAssembler::Frintx, zd_expected_x},
15882 {&MacroAssembler::Frintz, &MacroAssembler::Frintz, zd_expected_z}};
15883
15884 unsigned lane_sizes[] = {kHRegSize, kSRegSize, kDRegSize};
15885
15886 for (size_t i = 0; i < sizeof(test_data) / sizeof(TestDataSet); i++) {
15887 for (size_t j = 0; j < ArrayLength(lane_sizes); j++) {
15888 TestFrintHelper(config,
15889 test_data[i].macro_m,
15890 test_data[i].macro_z,
15891 lane_sizes[j],
15892 zn_inputs,
15893 pg_inputs,
15894 test_data[i].expected);
15895 }
15896 }
TatWai Chongdb7437c2020-01-09 17:44:10 -080015897}
15898
TatWai Chong31cd6a02020-01-10 13:03:26 -080015899struct CvtfTestDataSet {
15900 uint64_t int_value;
15901 uint64_t scvtf_result;
15902 uint64_t ucvtf_result;
15903};
15904
15905template <size_t N>
15906static void TestUScvtfHelper(Test* config,
15907 int dst_type_size_in_bits,
15908 int src_type_size_in_bits,
15909 const int (&pg_inputs)[N],
15910 const CvtfTestDataSet (&data_set)[N]) {
15911 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
15912 START();
15913
15914 // Unpack the data from the array of structs into individual arrays, to
15915 // simplify the testing.
15916 uint64_t zn_inputs[N];
15917 uint64_t expected_zd_scvtf_all_active[N];
15918 uint64_t expected_zd_ucvtf_all_active[N];
15919 for (size_t i = 0; i < N; i++) {
15920 zn_inputs[i] = data_set[i].int_value;
15921 expected_zd_scvtf_all_active[i] = data_set[i].scvtf_result;
15922 expected_zd_ucvtf_all_active[i] = data_set[i].ucvtf_result;
15923 }
15924
15925 // If the input and result types have a different size, the instruction
15926 // operates on elements of the largest specified type.
15927 int lane_size_in_bits =
15928 std::max(dst_type_size_in_bits, src_type_size_in_bits);
15929
15930 ZRegister zd_scvtf_all_active = z25;
15931 ZRegister zd_ucvtf_all_active = z26;
15932 ZRegister zn = z27;
15933 InsrHelper(&masm, zn.WithLaneSize(lane_size_in_bits), zn_inputs);
15934
15935 PRegisterWithLaneSize pg_all_active = p0.WithLaneSize(lane_size_in_bits);
15936 __ Ptrue(pg_all_active);
15937
Josh Sorefb43d6ef2022-08-03 12:47:14 -040015938 // Test integer conversions with all lanes activated.
TatWai Chong31cd6a02020-01-10 13:03:26 -080015939 __ Scvtf(zd_scvtf_all_active.WithLaneSize(dst_type_size_in_bits),
15940 pg_all_active.Merging(),
15941 zn.WithLaneSize(src_type_size_in_bits));
15942 __ Ucvtf(zd_ucvtf_all_active.WithLaneSize(dst_type_size_in_bits),
15943 pg_all_active.Merging(),
15944 zn.WithLaneSize(src_type_size_in_bits));
15945
15946 ZRegister zd_scvtf_merged = z23;
15947 ZRegister zd_ucvtf_merged = z24;
15948
15949 PRegisterWithLaneSize pg_merged = p1.WithLaneSize(lane_size_in_bits);
15950 Initialise(&masm, pg_merged, pg_inputs);
15951
15952 uint64_t snan;
15953 switch (lane_size_in_bits) {
15954 case kHRegSize:
15955 snan = 0x7c11;
15956 break;
15957 case kSRegSize:
15958 snan = 0x7f951111;
15959 break;
15960 case kDRegSize:
15961 snan = 0x7ff5555511111111;
15962 break;
15963 }
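  // Fill the merging destinations with this signalling NaN so that lanes left
  // unchanged by the merging conversions are easy to identify in the checks
  // below.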
15964 __ Dup(zd_scvtf_merged.WithLaneSize(lane_size_in_bits), snan);
15965 __ Dup(zd_ucvtf_merged.WithLaneSize(lane_size_in_bits), snan);
15966
15967 // Use the same `zn` inputs to test integer conversions, but with some lanes
15968 // set inactive.
15969 __ Scvtf(zd_scvtf_merged.WithLaneSize(dst_type_size_in_bits),
15970 pg_merged.Merging(),
15971 zn.WithLaneSize(src_type_size_in_bits));
15972 __ Ucvtf(zd_ucvtf_merged.WithLaneSize(dst_type_size_in_bits),
15973 pg_merged.Merging(),
15974 zn.WithLaneSize(src_type_size_in_bits));
15975
15976 END();
15977
15978 if (CAN_RUN()) {
15979 RUN();
15980
15981 ASSERT_EQUAL_SVE(expected_zd_scvtf_all_active,
15982 zd_scvtf_all_active.WithLaneSize(lane_size_in_bits));
15983 ASSERT_EQUAL_SVE(expected_zd_ucvtf_all_active,
15984 zd_ucvtf_all_active.WithLaneSize(lane_size_in_bits));
15985
15986 uint64_t expected_zd_scvtf_merged[N];
15987 for (size_t i = 0; i < N; i++) {
15988 expected_zd_scvtf_merged[i] =
15989 pg_inputs[i] ? expected_zd_scvtf_all_active[i] : snan;
15990 }
15991 ASSERT_EQUAL_SVE(expected_zd_scvtf_merged,
15992 zd_scvtf_merged.WithLaneSize(lane_size_in_bits));
15993
15994 uint64_t expected_zd_ucvtf_merged[N];
15995 for (size_t i = 0; i < N; i++) {
15996 expected_zd_ucvtf_merged[i] =
15997 pg_inputs[i] ? expected_zd_ucvtf_all_active[i] : snan;
15998 }
15999 ASSERT_EQUAL_SVE(expected_zd_ucvtf_merged,
16000 zd_ucvtf_merged.WithLaneSize(lane_size_in_bits));
16001 }
16002}
16003
16004TEST_SVE(scvtf_ucvtf_h_s_d_to_float16) {
16005 // clang-format off
16006 CvtfTestDataSet data_set_1[] = {
16007 // Simple conversions of positive numbers which require no rounding; the
Josh Sorefb43d6ef2022-08-03 12:47:14 -040016008 // results should not depend on the rounding mode, and ucvtf and scvtf should
TatWai Chong31cd6a02020-01-10 13:03:26 -080016009 // produce the same result.
16010 {0x0000, 0x0000, 0x0000},
16011 {0x0001, 0x3c00, 0x3c00},
16012 {0x0010, 0x4c00, 0x4c00},
16013 {0x0080, 0x5800, 0x5800},
16014 {0x0400, 0x6400, 0x6400},
16015 // Conversions which require rounding.
16016 {0x4000, 0x7400, 0x7400},
16017 {0x4001, 0x7400, 0x7400},
16018 // Round up to produce a result that's too big for the input to represent.
16019 {0x7ff0, 0x77ff, 0x77ff},
16020 {0x7ff1, 0x77ff, 0x77ff},
16021 {0x7ffe, 0x7800, 0x7800},
16022 {0x7fff, 0x7800, 0x7800}};
16023 int pg_1[] = {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
16024 TestUScvtfHelper(config, kHRegSize, kDRegSize, pg_1, data_set_1);
16025 TestUScvtfHelper(config, kHRegSize, kSRegSize, pg_1, data_set_1);
16026 TestUScvtfHelper(config, kHRegSize, kHRegSize, pg_1, data_set_1);
16027
16028 CvtfTestDataSet data_set_2[] = {
16029 // Test mantissa extremities.
16030 {0x0401, 0x6401, 0x6401},
16031 {0x4020, 0x7402, 0x7402},
16032 // The largest int16_t that fits in a float16.
16033 {0xffef, 0xcc40, 0x7bff},
16034 // Values that would be negative if treated as an int16_t.
16035 {0xff00, 0xdc00, 0x7bf8},
16036 {0x8000, 0xf800, 0x7800},
16037 {0x8100, 0xf7f0, 0x7808},
16038 // Check for bit pattern reproduction.
16039 {0x0123, 0x5c8c, 0x5c8c},
16040 {0x0cde, 0x6a6f, 0x6a6f},
16041 // Simple conversions of negative int16_t values. These require no rounding,
16042 // and the results should not depend on the rounding mode.
16043 {0xf800, 0xe800, 0x7bc0},
16044 {0xfc00, 0xe400, 0x7be0},
16045 {0xc000, 0xf400, 0x7a00},
16046 // Check rounding of negative int16_t values.
16047 {0x8ffe, 0xf700, 0x7880},
16048 {0x8fff, 0xf700, 0x7880},
16049 {0xffee, 0xcc80, 0x7bff},
16050 {0xffef, 0xcc40, 0x7bff}};
16051 int pg_2[] = {1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1};
16052 // The `32-bit to float16` and `64-bit to float16` forms of the above tests have
16053 // been covered by the `ucvtf` cases of `16-bit to float16`.
16054 TestUScvtfHelper(config, kHRegSize, kHRegSize, pg_2, data_set_2);
16055 // clang-format on
16056}
16057
16058TEST_SVE(scvtf_ucvtf_s_to_float) {
16059 // clang-format off
16060 int dst_lane_size = kSRegSize;
16061 int src_lane_size = kSRegSize;
16062
16063 // Simple conversions of positive numbers which require no rounding; the
Josh Sorefb43d6ef2022-08-03 12:47:14 -040016064 // results should not depend on the rounding mode, and ucvtf and scvtf should
TatWai Chong31cd6a02020-01-10 13:03:26 -080016065 // produce the same result.
16066 CvtfTestDataSet data_set_1[] = {
16067 {0x00000000, 0x00000000, 0x00000000},
16068 {0x00000001, 0x3f800000, 0x3f800000},
16069 {0x00004000, 0x46800000, 0x46800000},
16070 {0x00010000, 0x47800000, 0x47800000},
16071 {0x40000000, 0x4e800000, 0x4e800000}};
16072 int pg_1[] = {1, 0, 1, 0, 0};
16073 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_1, data_set_1);
16074
16075 CvtfTestDataSet data_set_2[] = {
16076 // Test mantissa extremities.
16077 {0x00800001, 0x4b000001, 0x4b000001},
16078 {0x40400000, 0x4e808000, 0x4e808000},
16079 // The largest int32_t that fits in a float.
16080 {0x7fffff80, 0x4effffff, 0x4effffff},
16081 // Values that would be negative if treated as an int32_t.
16082 {0xffffffff, 0xbf800000, 0x4f800000},
16083 {0xffffff00, 0xc3800000, 0x4f7fffff},
16084 {0x80000000, 0xcf000000, 0x4f000000},
16085 {0x80000001, 0xcf000000, 0x4f000000},
16086 // Check for bit pattern reproduction.
16087 {0x089abcde, 0x4d09abce, 0x4d09abce},
16088 {0x12345678, 0x4d91a2b4, 0x4d91a2b4}};
16089 int pg_2[] = {1, 0, 1, 0, 1, 1, 1, 0, 0};
16090 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_2, data_set_2);
16091
16092 // Simple conversions of negative int32_t values. These require no rounding,
16093 // and the results should not depend on the rounding mode.
16094 CvtfTestDataSet data_set_3[] = {
16095 {0xffffc000, 0xc6800000, 0x4f7fffc0},
16096 {0xffff0000, 0xc7800000, 0x4f7fff00},
16097 {0xc0000000, 0xce800000, 0x4f400000},
16098 // Conversions which require rounding.
16099 {0x72800000, 0x4ee50000, 0x4ee50000},
16100 {0x72800001, 0x4ee50000, 0x4ee50000},
16101 {0x73000000, 0x4ee60000, 0x4ee60000},
16102 // Check rounding of negative int32_t values.
16103 {0x80000140, 0xcefffffe, 0x4f000001},
16104 {0x80000141, 0xcefffffd, 0x4f000001},
16105 {0x80000180, 0xcefffffd, 0x4f000002},
16106 // Round up to produce a result that's too big for the input to represent.
16107 {0x7fffffc0, 0x4f000000, 0x4f000000},
16108 {0x7fffffff, 0x4f000000, 0x4f000000}};
16109 int pg_3[] = {1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0};
16110 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_3, data_set_3);
16111 // clang-format on
16112}
16113
16114TEST_SVE(scvtf_ucvtf_d_to_float) {
16115 // clang-format off
16116 int dst_lane_size = kSRegSize;
16117 int src_lane_size = kDRegSize;
16118
16119 // Simple conversions of positive numbers which require no rounding; the
Josh Sorefb43d6ef2022-08-03 12:47:14 -040016120 // results should not depend on the rounding mode, and ucvtf and scvtf should
TatWai Chong31cd6a02020-01-10 13:03:26 -080016121 // produce the same result.
16122 CvtfTestDataSet data_set_1[] = {
16123 {0x0000000000000000, 0x00000000, 0x00000000},
16124 {0x0000000000000001, 0x3f800000, 0x3f800000},
16125 {0x0000000040000000, 0x4e800000, 0x4e800000},
16126 {0x0000000100000000, 0x4f800000, 0x4f800000},
16127 {0x4000000000000000, 0x5e800000, 0x5e800000}};
16128 int pg_1[] = {1, 1, 0, 1, 0};
16129 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_1, data_set_1);
16130
16131 CvtfTestDataSet data_set_2[] = {
16132 // Test mantissa extremities.
16133 {0x0010000000000001, 0x59800000, 0x59800000},
16134 {0x4008000000000000, 0x5e801000, 0x5e801000},
16135 // The largest int32_t that fits in a float.
16136 {0x000000007fffff80, 0x4effffff, 0x4effffff},
16137 // Values that would be negative if treated as an int32_t.
16138 {0x00000000ffffffff, 0x4f800000, 0x4f800000},
16139 {0x00000000ffffff00, 0x4f7fffff, 0x4f7fffff},
16140 {0x0000000080000000, 0x4f000000, 0x4f000000},
16141 {0x0000000080000100, 0x4f000001, 0x4f000001},
16142 // The largest int64_t that fits in a float.
16143 {0x7fffff8000000000, 0x5effffff, 0x5effffff},
16144 // Check for bit pattern reproduction.
16145 {0x0123456789abcde0, 0x5b91a2b4, 0x5b91a2b4},
16146 {0x0000000000876543, 0x4b076543, 0x4b076543}};
16147 int pg_2[] = {1, 0, 0, 0, 1, 0, 0, 0, 0, 1};
16148 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_2, data_set_2);
16149
16150 CvtfTestDataSet data_set_3[] = {
16151 // Simple conversions of negative int64_t values. These require no rounding,
16152 // and the results should not depend on the rounding mode.
16153 {0xffffffffc0000000, 0xce800000, 0x5f800000},
16154 {0xffffffff00000000, 0xcf800000, 0x5f800000},
16155 {0xc000000000000000, 0xde800000, 0x5f400000},
16156 // Conversions which require rounding.
16157 {0x0000800002800000, 0x57000002, 0x57000002},
16158 {0x0000800002800001, 0x57000003, 0x57000003},
16159 {0x0000800003000000, 0x57000003, 0x57000003},
16160 // Check rounding of negative int64_t values.
16161 {0x8000014000000000, 0xdefffffe, 0x5f000001},
16162 {0x8000014000000001, 0xdefffffd, 0x5f000001},
16163 {0x8000018000000000, 0xdefffffd, 0x5f000002},
16164 // Round up to produce a result that's too big for the input to represent.
16165 {0x00000000ffffff80, 0x4f800000, 0x4f800000},
16166 {0x00000000ffffffff, 0x4f800000, 0x4f800000},
16167 {0xffffff8000000000, 0xd3000000, 0x5f800000},
16168 {0xffffffffffffffff, 0xbf800000, 0x5f800000}};
16169 int pg_3[] = {0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1};
16170 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_3, data_set_3);
16171 // clang-format on
16172}
16173
16174TEST_SVE(scvtf_ucvtf_d_to_double) {
16175 // clang-format off
16176 int dst_lane_size = kDRegSize;
16177 int src_lane_size = kDRegSize;
16178
16179 // Simple conversions of positive numbers which require no rounding; the
Josh Sorefb43d6ef2022-08-03 12:47:14 -040016180 // results should not depend on the rounding mode, and ucvtf and scvtf should
TatWai Chong31cd6a02020-01-10 13:03:26 -080016181 // produce the same result.
16182 CvtfTestDataSet data_set_1[] = {
16183 {0x0000000000000000, 0x0000000000000000, 0x0000000000000000},
16184 {0x0000000000000001, 0x3ff0000000000000, 0x3ff0000000000000},
16185 {0x0000000040000000, 0x41d0000000000000, 0x41d0000000000000},
16186 {0x0000000100000000, 0x41f0000000000000, 0x41f0000000000000},
16187 {0x4000000000000000, 0x43d0000000000000, 0x43d0000000000000}};
16188 int pg_1[] = {0, 1, 1, 0, 0};
16189 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_1, data_set_1);
16190
16191 CvtfTestDataSet data_set_2[] = {
16192 // Test mantissa extremities.
16193 {0x0010000000000001, 0x4330000000000001, 0x4330000000000001},
16194 {0x4008000000000000, 0x43d0020000000000, 0x43d0020000000000},
16195 // The largest int32_t that fits in a double.
16196 {0x000000007fffffff, 0x41dfffffffc00000, 0x41dfffffffc00000},
16197 // Values that would be negative if treated as an int32_t.
16198 {0x00000000ffffffff, 0x41efffffffe00000, 0x41efffffffe00000},
16199 {0x0000000080000000, 0x41e0000000000000, 0x41e0000000000000},
16200 {0x0000000080000001, 0x41e0000000200000, 0x41e0000000200000},
16201 // The largest int64_t that fits in a double.
16202 {0x7ffffffffffffc00, 0x43dfffffffffffff, 0x43dfffffffffffff},
16203 // Check for bit pattern reproduction.
16204 {0x0123456789abcde0, 0x43723456789abcde, 0x43723456789abcde},
16205 {0x0000000012345678, 0x41b2345678000000, 0x41b2345678000000}};
16206 int pg_2[] = {1, 1, 1, 1, 1, 0, 0, 0, 0};
16207 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_2, data_set_2);
16208
16209 CvtfTestDataSet data_set_3[] = {
16210 // Simple conversions of negative int64_t values. These require no rounding,
16211 // and the results should not depend on the rounding mode.
16212 {0xffffffffc0000000, 0xc1d0000000000000, 0x43effffffff80000},
16213 {0xffffffff00000000, 0xc1f0000000000000, 0x43efffffffe00000},
16214 {0xc000000000000000, 0xc3d0000000000000, 0x43e8000000000000},
16215 // Conversions which require rounding.
16216 {0x1000000000000280, 0x43b0000000000002, 0x43b0000000000002},
16217 {0x1000000000000281, 0x43b0000000000003, 0x43b0000000000003},
16218 {0x1000000000000300, 0x43b0000000000003, 0x43b0000000000003},
16219 // Check rounding of negative int64_t values.
16220 {0x8000000000000a00, 0xc3dffffffffffffe, 0x43e0000000000001},
16221 {0x8000000000000a01, 0xc3dffffffffffffd, 0x43e0000000000001},
16222 {0x8000000000000c00, 0xc3dffffffffffffd, 0x43e0000000000002},
16223 // Round up to produce a result that's too big for the input to represent.
16224 {0x7ffffffffffffe00, 0x43e0000000000000, 0x43e0000000000000},
16225 {0x7fffffffffffffff, 0x43e0000000000000, 0x43e0000000000000},
16226 {0xfffffffffffffc00, 0xc090000000000000, 0x43f0000000000000},
16227 {0xffffffffffffffff, 0xbff0000000000000, 0x43f0000000000000}};
16228 int pg_3[] = {1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0};
16229 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_3, data_set_3);
16230 // clang-format on
16231}
16232
16233TEST_SVE(scvtf_ucvtf_s_to_double) {
16234 // clang-format off
16235 int dst_lane_size = kDRegSize;
16236 int src_lane_size = kSRegSize;
16237
16238 // Simple conversions of positive numbers which require no rounding; the
Josh Sorefb43d6ef2022-08-03 12:47:14 -040016239 // results should not depend on the rounding mode, and ucvtf and scvtf should
TatWai Chong31cd6a02020-01-10 13:03:26 -080016240 // produce the same result.
16241 CvtfTestDataSet data_set_1[] = {
16242 {0x00000000, 0x0000000000000000, 0x0000000000000000},
16243 {0x00000001, 0x3ff0000000000000, 0x3ff0000000000000},
16244 {0x00004000, 0x40d0000000000000, 0x40d0000000000000},
16245 {0x00010000, 0x40f0000000000000, 0x40f0000000000000},
16246 {0x40000000, 0x41d0000000000000, 0x41d0000000000000}};
16247 int pg_1[] = {1, 0, 0, 0, 1};
16248 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_1, data_set_1);
16249
16250 CvtfTestDataSet data_set_2[] = {
16251 // Test mantissa extremities.
16252 {0x40000400, 0x41d0000100000000, 0x41d0000100000000},
16253 // The largest int32_t that fits in a double.
16254 {0x7fffffff, 0x41dfffffffc00000, 0x41dfffffffc00000},
16255 // Values that would be negative if treated as an int32_t.
16256 {0xffffffff, 0xbff0000000000000, 0x41efffffffe00000},
16257 {0x80000000, 0xc1e0000000000000, 0x41e0000000000000},
16258 {0x80000001, 0xc1dfffffffc00000, 0x41e0000000200000},
16259 // Check for bit pattern reproduction.
16260 {0x089abcde, 0x41a13579bc000000, 0x41a13579bc000000},
16261 {0x12345678, 0x41b2345678000000, 0x41b2345678000000},
16262 // Simple conversions of negative int32_t values. These require no rounding,
16263 // and the results should not depend on the rounding mode.
16264 {0xffffc000, 0xc0d0000000000000, 0x41effff800000000},
16265 {0xffff0000, 0xc0f0000000000000, 0x41efffe000000000},
16266 {0xc0000000, 0xc1d0000000000000, 0x41e8000000000000}};
16267 int pg_2[] = {1, 0, 1, 0, 0, 1, 1, 0, 1, 1};
16268 TestUScvtfHelper(config, dst_lane_size, src_lane_size, pg_2, data_set_2);
16269
16270 // Note that the IEEE 754 double-precision format has a 52-bit fraction, so all
16271 // 32-bit integers are exactly representable in double.
16272 // clang-format on
16273}
16274
Martyn Capewell4a9829f2020-01-30 17:41:01 +000016275TEST_SVE(sve_fadda) {
Anton Kirilov279f08b2023-06-20 10:55:10 +010016276 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kFP);
Martyn Capewell4a9829f2020-01-30 17:41:01 +000016277 START();
16278
16279 __ Ptrue(p0.VnB());
16280 __ Pfalse(p1.VnB());
16281 __ Zip1(p1.VnH(), p0.VnH(), p1.VnH());
16282
16283 __ Index(z0.VnS(), 3, 3);
16284 __ Scvtf(z0.VnS(), p0.Merging(), z0.VnS());
16285 __ Fmov(s2, 2.0);
16286 __ Fadda(s2, p0, s2, z0.VnS());
16287
16288 __ Index(z0.VnD(), -7, -7);
16289 __ Scvtf(z0.VnD(), p0.Merging(), z0.VnD());
16290 __ Fmov(d3, 3.0);
16291 __ Fadda(d3, p0, d3, z0.VnD());
16292
16293 __ Index(z0.VnH(), 1, 1);
16294 __ Scvtf(z0.VnH(), p0.Merging(), z0.VnH());
16295 __ Fmov(h4, 0);
16296 __ Fadda(h4, p1, h4, z0.VnH());
16297 END();
16298
16299 if (CAN_RUN()) {
16300 RUN();
16301 // Sum of 1 .. n is (n + 1) * (n / 2), i.e. n(n+1)/2.
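    // For example, with a 128-bit VL the four S lanes hold 3.0, 6.0, 9.0 and 12.0,
    // so fadda accumulates 2.0 + 30.0 = 32.0 into s2.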
16302 int n = core.GetSVELaneCount(kSRegSize);
16303 ASSERT_EQUAL_FP32(2 + 3 * ((n + 1) * (n / 2)), s2);
16304
16305 n /= 2; // Half as many lanes.
16306 ASSERT_EQUAL_FP64(3 + -7 * ((n + 1) * (n / 2)), d3);
16307
16308 // Sum of first n odd numbers is n^2.
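    // Only every other H lane is active (p1), and those lanes hold 1.0, 3.0,
    // 5.0, ...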
16309 n = core.GetSVELaneCount(kHRegSize) / 2; // Half are odd numbers.
16310 ASSERT_EQUAL_FP16(Float16(n * n), h4);
16311 }
16312}
16313
Martyn Capewellac07af12019-12-02 14:55:05 +000016314TEST_SVE(sve_extract) {
16315 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
16316 START();
16317
16318 __ Index(z0.VnB(), 0, 1);
16319
16320 __ Mov(z1, z0);
16321 __ Mov(z2, z0);
16322 __ Mov(z3, z0);
16323 __ Mov(z4, z0);
16324 __ Mov(z5, z0);
16325 __ Mov(z6, z0);
16326
16327 __ Ext(z1, z1, z0, 0);
16328 __ Ext(z2, z2, z0, 1);
16329 __ Ext(z3, z3, z0, 15);
16330 __ Ext(z4, z4, z0, 31);
16331 __ Ext(z5, z5, z0, 47);
16332 __ Ext(z6, z6, z0, 255);
16333
16334 END();
16335
16336 if (CAN_RUN()) {
16337 RUN();
16338
16339 ASSERT_EQUAL_SVE(z1, z0);
16340
16341 int lane_count = core.GetSVELaneCount(kBRegSize);
16342 if (lane_count == 16) {
16343 uint64_t z2_expected[] = {0x000f0e0d0c0b0a09, 0x0807060504030201};
16344 ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
16345 } else {
16346 uint64_t z2_expected[] = {0x100f0e0d0c0b0a09, 0x0807060504030201};
16347 ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
16348 }
16349
16350 if (lane_count == 16) {
16351 uint64_t z3_expected[] = {0x0e0d0c0b0a090807, 0x060504030201000f};
16352 ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
16353 } else {
16354 uint64_t z3_expected[] = {0x1e1d1c1b1a191817, 0x161514131211100f};
16355 ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
16356 }
16357
16358 if (lane_count < 32) {
16359 ASSERT_EQUAL_SVE(z4, z0);
16360 } else if (lane_count == 32) {
16361 uint64_t z4_expected[] = {0x0e0d0c0b0a090807, 0x060504030201001f};
16362 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
16363 } else {
16364 uint64_t z4_expected[] = {0x2e2d2c2b2a292827, 0x262524232221201f};
16365 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
16366 }
16367
16368 if (lane_count < 48) {
16369 ASSERT_EQUAL_SVE(z5, z0);
16370 } else if (lane_count == 48) {
16371 uint64_t z5_expected[] = {0x0e0d0c0b0a090807, 0x060504030201002f};
16372 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
16373 } else {
16374 uint64_t z5_expected[] = {0x3e3d3c3b3a393837, 0x363534333231302f};
16375 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
16376 }
16377
16378 if (lane_count < 256) {
16379 ASSERT_EQUAL_SVE(z6, z0);
16380 } else {
16381 uint64_t z6_expected[] = {0x0e0d0c0b0a090807, 0x06050403020100ff};
16382 ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
16383 }
16384 }
16385}
16386
Martyn Capewell894962f2020-02-05 15:46:44 +000016387TEST_SVE(sve_fp_paired_across) {
16388 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
16389
16390 START();
16391
16392 __ Ptrue(p0.VnB());
16393 __ Pfalse(p1.VnB());
16394 __ Zip1(p2.VnS(), p0.VnS(), p1.VnS());
16395 __ Zip1(p3.VnD(), p0.VnD(), p1.VnD());
16396 __ Zip1(p4.VnH(), p0.VnH(), p1.VnH());
16397
16398 __ Index(z0.VnS(), 3, 3);
16399 __ Scvtf(z0.VnS(), p0.Merging(), z0.VnS());
16400 __ Faddv(s1, p0, z0.VnS());
16401 __ Fminv(s2, p2, z0.VnS());
16402 __ Fmaxv(s3, p2, z0.VnS());
16403
16404 __ Index(z0.VnD(), -7, -7);
16405 __ Scvtf(z0.VnD(), p0.Merging(), z0.VnD());
16406 __ Faddv(d4, p0, z0.VnD());
16407 __ Fminv(d5, p3, z0.VnD());
16408 __ Fmaxv(d6, p3, z0.VnD());
16409
16410 __ Index(z0.VnH(), 1, 1);
16411 __ Scvtf(z0.VnH(), p0.Merging(), z0.VnH());
16412 __ Faddv(h7, p4, z0.VnH());
16413 __ Fminv(h8, p4, z0.VnH());
16414 __ Fmaxv(h9, p4, z0.VnH());
16415
16416 __ Dup(z10.VnH(), 0);
16417 __ Fdiv(z10.VnH(), p0.Merging(), z10.VnH(), z10.VnH());
16418 __ Insr(z10.VnH(), 0x5140);
16419 __ Insr(z10.VnH(), 0xd140);
16420 __ Ext(z10.VnB(), z10.VnB(), z10.VnB(), 2);
16421 __ Fmaxnmv(h11, p0, z10.VnH());
16422 __ Fmaxnmv(h12, p4, z10.VnH());
16423 __ Fminnmv(h13, p0, z10.VnH());
16424 __ Fminnmv(h14, p4, z10.VnH());
16425
16426 __ Dup(z10.VnS(), 0);
16427 __ Fdiv(z10.VnS(), p0.Merging(), z10.VnS(), z10.VnS());
16428 __ Insr(z10.VnS(), 0x42280000);
16429 __ Insr(z10.VnS(), 0xc2280000);
16430 __ Ext(z10.VnB(), z10.VnB(), z10.VnB(), 4);
16431 __ Fmaxnmv(s15, p0, z10.VnS());
16432 __ Fmaxnmv(s16, p2, z10.VnS());
16433 __ Fminnmv(s17, p0, z10.VnS());
16434 __ Fminnmv(s18, p2, z10.VnS());
16435
16436 __ Dup(z10.VnD(), 0);
16437 __ Fdiv(z10.VnD(), p0.Merging(), z10.VnD(), z10.VnD());
16438 __ Insr(z10.VnD(), 0x4045000000000000);
16439 __ Insr(z10.VnD(), 0xc045000000000000);
16440 __ Ext(z10.VnB(), z10.VnB(), z10.VnB(), 8);
16441 __ Fmaxnmv(d19, p0, z10.VnD());
16442 __ Fmaxnmv(d20, p3, z10.VnD());
16443 __ Fminnmv(d21, p0, z10.VnD());
16444 __ Fminnmv(d22, p3, z10.VnD());
16445 END();
16446
16447 if (CAN_RUN()) {
16448 RUN();
16449 // Sum of 1 .. n is (n + 1) * (n / 2), i.e. n(n+1)/2.
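    // For example, with a 128-bit VL the four S lanes hold 3.0, 6.0, 9.0 and 12.0:
    // faddv gives 30.0, and fminv/fmaxv over the even lanes (p2) give 3.0 and 9.0.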
16450 int n = core.GetSVELaneCount(kSRegSize);
16451 ASSERT_EQUAL_FP32(3 * ((n + 1) * (n / 2)), s1);
16452 ASSERT_EQUAL_FP32(3, s2);
16453 ASSERT_EQUAL_FP32(3 * n - 3, s3);
16454
16455 n /= 2; // Half as many lanes.
16456 ASSERT_EQUAL_FP64(-7 * ((n + 1) * (n / 2)), d4);
16457 ASSERT_EQUAL_FP64(-7 * (n - 1), d5);
16458 ASSERT_EQUAL_FP64(-7, d6);
16459
16460 // Sum of first n odd numbers is n^2.
16461 n = core.GetSVELaneCount(kHRegSize) / 2; // Half are odd numbers.
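    // For example, with a 128-bit VL the active H lanes (p4) hold 1.0, 3.0, 5.0
    // and 7.0, so faddv gives 16.0.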
16462 ASSERT_EQUAL_FP16(Float16(n * n), h7);
16463 ASSERT_EQUAL_FP16(Float16(1), h8);
16464
16465 n = core.GetSVELaneCount(kHRegSize);
16466 ASSERT_EQUAL_FP16(Float16(n - 1), h9);
16467
16468 ASSERT_EQUAL_FP16(Float16(42), h11);
16469 ASSERT_EQUAL_FP16(Float16(42), h12);
16470 ASSERT_EQUAL_FP16(Float16(-42), h13);
16471 ASSERT_EQUAL_FP16(Float16(42), h14);
16472 ASSERT_EQUAL_FP32(42, s15);
16473 ASSERT_EQUAL_FP32(42, s16);
16474 ASSERT_EQUAL_FP32(-42, s17);
16475 ASSERT_EQUAL_FP32(42, s18);
16476 ASSERT_EQUAL_FP64(42, d19);
16477 ASSERT_EQUAL_FP64(42, d20);
16478 ASSERT_EQUAL_FP64(-42, d21);
16479 ASSERT_EQUAL_FP64(42, d22);
16480 }
16481}
16482
Martyn Capewell13050ca2020-02-11 16:43:40 +000016483TEST_SVE(sve_frecpe_frsqrte) {
16484 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
16485
16486 START();
16487
16488 __ Ptrue(p0.VnB());
16489
16490 __ Index(z0.VnH(), 0, 1);
16491 __ Fdup(z1.VnH(), Float16(1));
16492 __ Fscale(z1.VnH(), p0.Merging(), z1.VnH(), z0.VnH());
16493 __ Insr(z1.VnH(), 0);
16494 __ Frsqrte(z2.VnH(), z1.VnH());
16495 __ Frecpe(z1.VnH(), z1.VnH());
16496
16497 __ Index(z0.VnS(), 0, 1);
16498 __ Fdup(z3.VnS(), Float16(1));
16499 __ Fscale(z3.VnS(), p0.Merging(), z3.VnS(), z0.VnS());
16500 __ Insr(z3.VnS(), 0);
16501 __ Frsqrte(z4.VnS(), z3.VnS());
16502 __ Frecpe(z3.VnS(), z3.VnS());
16503
16504 __ Index(z0.VnD(), 0, 1);
16505 __ Fdup(z5.VnD(), Float16(1));
16506 __ Fscale(z5.VnD(), p0.Merging(), z5.VnD(), z0.VnD());
16507 __ Insr(z5.VnD(), 0);
16508 __ Frsqrte(z6.VnD(), z5.VnD());
16509 __ Frecpe(z5.VnD(), z5.VnD());
16510 END();
16511
16512 if (CAN_RUN()) {
16513 RUN();
16514 uint64_t z1_expected[] = {0x23fc27fc2bfc2ffc, 0x33fc37fc3bfc7c00};
16515 ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
16516 uint64_t z2_expected[] = {0x2ffc31a433fc35a4, 0x37fc39a43bfc7c00};
16517 ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
16518
16519 uint64_t z3_expected[] = {0x3e7f80003eff8000, 0x3f7f80007f800000};
16520 ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
16521 uint64_t z4_expected[] = {0x3eff80003f348000, 0x3f7f80007f800000};
16522 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
16523
16524 uint64_t z5_expected[] = {0x3feff00000000000, 0x7ff0000000000000};
16525 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
16526 uint64_t z6_expected[] = {0x3feff00000000000, 0x7ff0000000000000};
16527 ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
16528 }
16529}
16530
Martyn Capewellefd9dc72020-02-13 10:46:29 +000016531TEST_SVE(sve_frecps_frsqrts) {
16532 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
16533
16534 START();
16535 __ Ptrue(p0.VnB());
16536
16537 __ Index(z0.VnH(), 0, -1);
16538 __ Fdup(z1.VnH(), Float16(1));
16539 __ Fscale(z1.VnH(), p0.Merging(), z1.VnH(), z0.VnH());
16540 __ Scvtf(z0.VnH(), p0.Merging(), z0.VnH());
16541 __ Insr(z1.VnH(), 0);
16542 __ Frsqrts(z2.VnH(), z1.VnH(), z0.VnH());
16543 __ Frecps(z1.VnH(), z1.VnH(), z0.VnH());
16544
16545 __ Index(z0.VnS(), 0, -1);
16546 __ Fdup(z3.VnS(), Float16(1));
16547 __ Fscale(z3.VnS(), p0.Merging(), z3.VnS(), z0.VnS());
16548 __ Scvtf(z0.VnS(), p0.Merging(), z0.VnS());
16549 __ Insr(z3.VnS(), 0);
16550 __ Frsqrts(z4.VnS(), z3.VnS(), z0.VnS());
16551 __ Frecps(z3.VnS(), z3.VnS(), z0.VnS());
16552
16553 __ Index(z0.VnD(), 0, -1);
16554 __ Fdup(z5.VnD(), Float16(1));
16555 __ Fscale(z5.VnD(), p0.Merging(), z5.VnD(), z0.VnD());
16556 __ Scvtf(z0.VnD(), p0.Merging(), z0.VnD());
16557 __ Insr(z5.VnD(), 0);
16558 __ Frsqrts(z6.VnD(), z5.VnD(), z0.VnD());
16559 __ Frecps(z5.VnD(), z5.VnD(), z0.VnD());
16560 END();
16561
16562 if (CAN_RUN()) {
16563 RUN();
16564 uint64_t z1_expected[] = {0x4038406040a04100, 0x4180420042004000};
16565 ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
16566 uint64_t z2_expected[] = {0x3e383e603ea03f00, 0x3f80400040003e00};
16567 ASSERT_EQUAL_SVE(z2_expected, z2.VnD());
16568
16569 uint64_t z3_expected[] = {0x4030000040400000, 0x4040000040000000};
16570 ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
16571 uint64_t z4_expected[] = {0x3ff0000040000000, 0x400000003fc00000};
16572 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
16573
16574 uint64_t z5_expected[] = {0x4008000000000000, 0x4000000000000000};
16575 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
16576 uint64_t z6_expected[] = {0x4000000000000000, 0x3ff8000000000000};
16577 ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
16578 }
16579}
16580
16581TEST_SVE(sve_ftsmul) {
16582 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
16583
16584 START();
16585 __ Ptrue(p0.VnB());
16586
16587 __ Index(z0.VnH(), 0, 1);
16588 __ Rev(z1.VnH(), z0.VnH());
16589 __ Scvtf(z0.VnH(), p0.Merging(), z0.VnH());
16590 __ Dup(z2.VnH(), 0);
16591 __ Fdiv(z2.VnH(), p0.Merging(), z2.VnH(), z2.VnH());
16592 __ Ftsmul(z3.VnH(), z0.VnH(), z1.VnH());
16593 __ Ftsmul(z4.VnH(), z2.VnH(), z1.VnH());
16594
16595 __ Index(z0.VnS(), -7, 1);
16596 __ Rev(z1.VnS(), z0.VnS());
16597 __ Scvtf(z0.VnS(), p0.Merging(), z0.VnS());
16598 __ Dup(z2.VnS(), 0);
16599 __ Fdiv(z2.VnS(), p0.Merging(), z2.VnS(), z2.VnS());
16600 __ Ftsmul(z5.VnS(), z0.VnS(), z1.VnS());
16601 __ Ftsmul(z6.VnS(), z2.VnS(), z1.VnS());
16602
16603 __ Index(z0.VnD(), 2, -1);
16604 __ Rev(z1.VnD(), z0.VnD());
16605 __ Scvtf(z0.VnD(), p0.Merging(), z0.VnD());
16606 __ Dup(z2.VnD(), 0);
16607 __ Fdiv(z2.VnD(), p0.Merging(), z2.VnD(), z2.VnD());
16608 __ Ftsmul(z7.VnD(), z0.VnD(), z1.VnD());
16609 __ Ftsmul(z8.VnD(), z2.VnD(), z1.VnD());
16610 END();
16611
16612 if (CAN_RUN()) {
16613 RUN();
16614 uint64_t z3_expected[] = {0x5220d0804e40cc00, 0x4880c4003c008000};
16615 ASSERT_EQUAL_SVE(z3_expected, z3.VnD());
16616 uint64_t z4_expected[] = {0x7e007e007e007e00, 0x7e007e007e007e00};
16617 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
16618
Jacob Bramleydfb93b52020-07-02 12:06:45 +010016619 uint64_t z5_expected[] = {0xc180000041c80000, 0xc210000042440000};
Martyn Capewellefd9dc72020-02-13 10:46:29 +000016620 ASSERT_EQUAL_SVE(z5_expected, z5.VnD());
16621 uint64_t z6_expected[] = {0x7fc000007fc00000, 0x7fc000007fc00000};
16622 ASSERT_EQUAL_SVE(z6_expected, z6.VnD());
16623
16624 uint64_t z7_expected[] = {0x3ff0000000000000, 0xc010000000000000};
16625 ASSERT_EQUAL_SVE(z7_expected, z7.VnD());
16626 uint64_t z8_expected[] = {0x7ff8000000000000, 0x7ff8000000000000};
16627 ASSERT_EQUAL_SVE(z8_expected, z8.VnD());
16628 }
16629}
TatWai Chongf8d29f12020-02-16 22:53:18 -080016630
16631typedef void (MacroAssembler::*FPMulAccFn)(
16632 const ZRegister& zd,
16633 const PRegisterM& pg,
16634 const ZRegister& za,
16635 const ZRegister& zn,
16636 const ZRegister& zm,
16637 FPMacroNaNPropagationOption nan_option);
16638
16639 // `pg_inputs` is used to check predication correctness internally. It does not
16640 // affect the `result` argument: `result` holds the expected result under an
16641 // all-true predicate.
16642template <typename T, size_t N>
16643static void FPMulAccHelper(
16644 Test* config,
16645 FPMulAccFn macro,
16646 unsigned lane_size_in_bits,
16647 const int (&pg_inputs)[N],
16648 const T (&za_inputs)[N],
16649 const T (&zn_inputs)[N],
16650 const T (&zm_inputs)[N],
16651 const uint64_t (&result)[N],
16652 FPMacroNaNPropagationOption nan_option = FastNaNPropagation) {
16653 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
16654 START();
16655
16656 ZRegister zd = z0.WithLaneSize(lane_size_in_bits);
16657 ZRegister za = z1.WithLaneSize(lane_size_in_bits);
16658 ZRegister zn = z2.WithLaneSize(lane_size_in_bits);
16659 ZRegister zm = z3.WithLaneSize(lane_size_in_bits);
16660
16661 uint64_t za_rawbits[N];
16662 uint64_t zn_rawbits[N];
16663 uint64_t zm_rawbits[N];
16664
16665 FPToRawbitsWithSize(za_inputs, za_rawbits, lane_size_in_bits);
16666 FPToRawbitsWithSize(zn_inputs, zn_rawbits, lane_size_in_bits);
16667 FPToRawbitsWithSize(zm_inputs, zm_rawbits, lane_size_in_bits);
16668
16669 InsrHelper(&masm, za, za_rawbits);
16670 InsrHelper(&masm, zn, zn_rawbits);
16671 InsrHelper(&masm, zm, zm_rawbits);
16672
TatWai Chong2cb1b612020-03-04 23:51:21 -080016673 // Initialize `zd` with a signalling NaN.
16674 uint64_t sn = GetSignallingNan(lane_size_in_bits);
16675 __ Mov(x29, sn);
16676 __ Dup(zd, x29);
TatWai Chongf8d29f12020-02-16 22:53:18 -080016677
16678 Initialise(&masm, p0.WithLaneSize(lane_size_in_bits), pg_inputs);
16679
16680 // Fmla macro automatically selects between fmla, fmad and movprfx + fmla
16681 // Fmls `ditto` fmls, fmsb and movprfx + fmls
16682 // Fnmla `ditto` fnmla, fnmad and movprfx + fnmla
16683 // Fnmls `ditto` fnmls, fnmsb and movprfx + fnmls
16684 // based on what registers are aliased.
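  // In the calls below, `da_result` is used as both `zd` and `za`, `dn_result` as
  // both `zd` and `zn`, `dm_result` as both `zd` and `zm`, and `d_result` aliases
  // none of the source operands.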
16685 ZRegister da_result = z10.WithLaneSize(lane_size_in_bits);
16686 ZRegister dn_result = z11.WithLaneSize(lane_size_in_bits);
16687 ZRegister dm_result = z12.WithLaneSize(lane_size_in_bits);
16688 ZRegister d_result = z13.WithLaneSize(lane_size_in_bits);
16689
16690 __ Mov(da_result, za);
16691 (masm.*macro)(da_result, p0.Merging(), da_result, zn, zm, nan_option);
16692
16693 __ Mov(dn_result, zn);
16694 (masm.*macro)(dn_result, p0.Merging(), za, dn_result, zm, nan_option);
16695
16696 __ Mov(dm_result, zm);
16697 (masm.*macro)(dm_result, p0.Merging(), za, zn, dm_result, nan_option);
16698
16699 __ Mov(d_result, zd);
16700 (masm.*macro)(d_result, p0.Merging(), za, zn, zm, nan_option);
16701
16702 END();
16703
16704 if (CAN_RUN()) {
16705 RUN();
16706
16707 ASSERT_EQUAL_SVE(za_rawbits, za);
16708 ASSERT_EQUAL_SVE(zn_rawbits, zn);
16709 ASSERT_EQUAL_SVE(zm_rawbits, zm);
16710
16711 uint64_t da_expected[N];
16712 uint64_t dn_expected[N];
16713 uint64_t dm_expected[N];
16714 uint64_t d_expected[N];
16715 for (size_t i = 0; i < N; i++) {
16716 da_expected[i] = ((pg_inputs[i] & 1) != 0) ? result[i] : za_rawbits[i];
16717 dn_expected[i] = ((pg_inputs[i] & 1) != 0) ? result[i] : zn_rawbits[i];
16718 dm_expected[i] = ((pg_inputs[i] & 1) != 0) ? result[i] : zm_rawbits[i];
TatWai Chong2cb1b612020-03-04 23:51:21 -080016719 d_expected[i] = ((pg_inputs[i] & 1) != 0) ? result[i] : sn;
TatWai Chongf8d29f12020-02-16 22:53:18 -080016720 }
16721
16722 ASSERT_EQUAL_SVE(da_expected, da_result);
16723 ASSERT_EQUAL_SVE(dn_expected, dn_result);
16724 ASSERT_EQUAL_SVE(dm_expected, dm_result);
16725 ASSERT_EQUAL_SVE(d_expected, d_result);
16726 }
16727}
16728
16729TEST_SVE(sve_fmla_fmad) {
16730 // fmla : zd = za + zn * zm
16731 double za_inputs[] = {-39.0, 1.0, -3.0, 2.0};
16732 double zn_inputs[] = {-5.0, -20.0, 9.0, 8.0};
16733 double zm_inputs[] = {9.0, -5.0, 4.0, 5.0};
16734 int pg_inputs[] = {1, 1, 0, 1};
16735
16736 uint64_t fmla_result_h[] = {Float16ToRawbits(Float16(-84.0)),
16737 Float16ToRawbits(Float16(101.0)),
16738 Float16ToRawbits(Float16(33.0)),
16739 Float16ToRawbits(Float16(42.0))};
16740
16741 // `fmad` has been tested in the helper.
16742 FPMulAccHelper(config,
16743 &MacroAssembler::Fmla,
16744 kHRegSize,
16745 pg_inputs,
16746 za_inputs,
16747 zn_inputs,
16748 zm_inputs,
16749 fmla_result_h);
16750
16751 uint64_t fmla_result_s[] = {FloatToRawbits(-84.0f),
16752 FloatToRawbits(101.0f),
16753 FloatToRawbits(33.0f),
16754 FloatToRawbits(42.0f)};
16755
16756 FPMulAccHelper(config,
16757 &MacroAssembler::Fmla,
16758 kSRegSize,
16759 pg_inputs,
16760 za_inputs,
16761 zn_inputs,
16762 zm_inputs,
16763 fmla_result_s);
16764
16765 uint64_t fmla_result_d[] = {DoubleToRawbits(-84.0),
16766 DoubleToRawbits(101.0),
16767 DoubleToRawbits(33.0),
16768 DoubleToRawbits(42.0)};
16769
16770 FPMulAccHelper(config,
16771 &MacroAssembler::Fmla,
16772 kDRegSize,
16773 pg_inputs,
16774 za_inputs,
16775 zn_inputs,
16776 zm_inputs,
16777 fmla_result_d);
16778}
16779
16780TEST_SVE(sve_fmls_fmsb) {
16781 // fmls : zd = za - zn * zm
16782 double za_inputs[] = {-39.0, 1.0, -3.0, 2.0};
16783 double zn_inputs[] = {-5.0, -20.0, 9.0, 8.0};
16784 double zm_inputs[] = {9.0, -5.0, 4.0, 5.0};
16785 int pg_inputs[] = {1, 0, 1, 1};
16786
16787 uint64_t fmls_result_h[] = {Float16ToRawbits(Float16(6.0)),
16788 Float16ToRawbits(Float16(-99.0)),
16789 Float16ToRawbits(Float16(-39.0)),
16790 Float16ToRawbits(Float16(-38.0))};
16791
16792 // `fmsb` has been tested in the helper.
16793 FPMulAccHelper(config,
16794 &MacroAssembler::Fmls,
16795 kHRegSize,
16796 pg_inputs,
16797 za_inputs,
16798 zn_inputs,
16799 zm_inputs,
16800 fmls_result_h);
16801
16802 uint64_t fmls_result_s[] = {FloatToRawbits(6.0f),
16803 FloatToRawbits(-99.0f),
16804 FloatToRawbits(-39.0f),
16805 FloatToRawbits(-38.0f)};
16806
16807 FPMulAccHelper(config,
16808 &MacroAssembler::Fmls,
16809 kSRegSize,
16810 pg_inputs,
16811 za_inputs,
16812 zn_inputs,
16813 zm_inputs,
16814 fmls_result_s);
16815
16816 uint64_t fmls_result_d[] = {DoubleToRawbits(6.0),
16817 DoubleToRawbits(-99.0),
16818 DoubleToRawbits(-39.0),
16819 DoubleToRawbits(-38.0)};
16820
16821 FPMulAccHelper(config,
16822 &MacroAssembler::Fmls,
16823 kDRegSize,
16824 pg_inputs,
16825 za_inputs,
16826 zn_inputs,
16827 zm_inputs,
16828 fmls_result_d);
16829}
16830
16831TEST_SVE(sve_fnmla_fnmad) {
16832 // fnmla : zd = -za - zn * zm
16833 double za_inputs[] = {-39.0, 1.0, -3.0, 2.0};
16834 double zn_inputs[] = {-5.0, -20.0, 9.0, 8.0};
16835 double zm_inputs[] = {9.0, -5.0, 4.0, 5.0};
16836 int pg_inputs[] = {0, 1, 1, 1};
16837
16838 uint64_t fnmla_result_h[] = {Float16ToRawbits(Float16(84.0)),
16839 Float16ToRawbits(Float16(-101.0)),
16840 Float16ToRawbits(Float16(-33.0)),
16841 Float16ToRawbits(Float16(-42.0))};
16842
16843 // `fnmad` has been tested in the helper.
16844 FPMulAccHelper(config,
16845 &MacroAssembler::Fnmla,
16846 kHRegSize,
16847 pg_inputs,
16848 za_inputs,
16849 zn_inputs,
16850 zm_inputs,
16851 fnmla_result_h);
16852
16853 uint64_t fnmla_result_s[] = {FloatToRawbits(84.0f),
16854 FloatToRawbits(-101.0f),
16855 FloatToRawbits(-33.0f),
16856 FloatToRawbits(-42.0f)};
16857
16858 FPMulAccHelper(config,
16859 &MacroAssembler::Fnmla,
16860 kSRegSize,
16861 pg_inputs,
16862 za_inputs,
16863 zn_inputs,
16864 zm_inputs,
16865 fnmla_result_s);
16866
16867 uint64_t fnmla_result_d[] = {DoubleToRawbits(84.0),
16868 DoubleToRawbits(-101.0),
16869 DoubleToRawbits(-33.0),
16870 DoubleToRawbits(-42.0)};
16871
16872 FPMulAccHelper(config,
16873 &MacroAssembler::Fnmla,
16874 kDRegSize,
16875 pg_inputs,
16876 za_inputs,
16877 zn_inputs,
16878 zm_inputs,
16879 fnmla_result_d);
16880}
16881
16882TEST_SVE(sve_fnmls_fnmsb) {
16883 // fnmls : zd = -za + zn * zm
16884 double za_inputs[] = {-39.0, 1.0, -3.0, 2.0};
16885 double zn_inputs[] = {-5.0, -20.0, 9.0, 8.0};
16886 double zm_inputs[] = {9.0, -5.0, 4.0, 5.0};
16887 int pg_inputs[] = {1, 1, 1, 0};
16888
16889 uint64_t fnmls_result_h[] = {Float16ToRawbits(Float16(-6.0)),
16890 Float16ToRawbits(Float16(99.0)),
16891 Float16ToRawbits(Float16(39.0)),
16892 Float16ToRawbits(Float16(38.0))};
16893
16894 // `fnmsb` has been tested in the helper.
16895 FPMulAccHelper(config,
16896 &MacroAssembler::Fnmls,
16897 kHRegSize,
16898 pg_inputs,
16899 za_inputs,
16900 zn_inputs,
16901 zm_inputs,
16902 fnmls_result_h);
16903
16904 uint64_t fnmls_result_s[] = {FloatToRawbits(-6.0f),
16905 FloatToRawbits(99.0f),
16906 FloatToRawbits(39.0f),
16907 FloatToRawbits(38.0f)};
16908
16909 FPMulAccHelper(config,
16910 &MacroAssembler::Fnmls,
16911 kSRegSize,
16912 pg_inputs,
16913 za_inputs,
16914 zn_inputs,
16915 zm_inputs,
16916 fnmls_result_s);
16917
16918 uint64_t fnmls_result_d[] = {DoubleToRawbits(-6.0),
16919 DoubleToRawbits(99.0),
16920 DoubleToRawbits(39.0),
16921 DoubleToRawbits(38.0)};
16922
16923 FPMulAccHelper(config,
16924 &MacroAssembler::Fnmls,
16925 kDRegSize,
16926 pg_inputs,
16927 za_inputs,
16928 zn_inputs,
16929 zm_inputs,
16930 fnmls_result_d);
16931}
16932
TatWai Chonga2c1bb72020-02-16 23:16:47 -080016933typedef void (MacroAssembler::*FPMulAccIdxFn)(const ZRegister& zd,
16934 const ZRegister& za,
16935 const ZRegister& zn,
16936 const ZRegister& zm,
16937 int index);
16938
16939template <typename T, size_t N>
16940static void FPMulAccIdxHelper(Test* config,
16941 FPMulAccFn macro,
16942 FPMulAccIdxFn macro_idx,
16943 const T (&za_inputs)[N],
16944 const T (&zn_inputs)[N],
16945 const T (&zm_inputs)[N]) {
16946 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
16947 START();
16948
Martyn Capewellc7501512020-03-16 10:35:33 +000016949 __ Ptrue(p0.VnB());
16950
16951 // Repeat indexed vector across up to 2048-bit VL.
16952 for (size_t i = 0; i < (kZRegMaxSize / kDRegSize); i += N) {
16953 InsrHelper(&masm, z30.VnD(), zm_inputs);
16954 }
16955
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070016956 FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z30.VnH());
Martyn Capewellc7501512020-03-16 10:35:33 +000016957
TatWai Chonga2c1bb72020-02-16 23:16:47 -080016958 InsrHelper(&masm, z1.VnD(), zn_inputs);
16959 InsrHelper(&masm, z2.VnD(), za_inputs);
16960
16961 __ Mov(z3, z0);
16962 (masm.*macro_idx)(z3.VnH(), z2.VnH(), z1.VnH(), z3.VnH(), 0); // zd == zm
16963 __ Mov(z4, z1);
16964 (masm.*macro_idx)(z4.VnH(), z2.VnH(), z4.VnH(), z0.VnH(), 1); // zd == zn
16965 __ Mov(z5, z2);
16966 (masm.*macro_idx)(z5.VnH(), z5.VnH(), z1.VnH(), z0.VnH(), 4); // zd == za
16967 (masm.*macro_idx)(z6.VnH(), z2.VnH(), z1.VnH(), z0.VnH(), 7);
16968
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070016969 FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z30.VnS());
Martyn Capewellc7501512020-03-16 10:35:33 +000016970
TatWai Chonga2c1bb72020-02-16 23:16:47 -080016971 __ Mov(z7, z0);
16972 (masm.*macro_idx)(z7.VnS(), z2.VnS(), z1.VnS(), z7.VnS(), 0); // zd == zm
16973 __ Mov(z8, z1);
16974 (masm.*macro_idx)(z8.VnS(), z2.VnS(), z8.VnS(), z0.VnS(), 1); // zd == zn
16975 __ Mov(z9, z2);
16976 (masm.*macro_idx)(z9.VnS(), z9.VnS(), z1.VnS(), z0.VnS(), 2); // zd == za
16977 (masm.*macro_idx)(z10.VnS(), z2.VnS(), z1.VnS(), z0.VnS(), 3);
16978
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070016979 FPSegmentPatternHelper(&masm, z0.VnD(), p0.Merging(), z30.VnD());
Martyn Capewellc7501512020-03-16 10:35:33 +000016980
TatWai Chonga2c1bb72020-02-16 23:16:47 -080016981 __ Mov(z11, z0);
16982 (masm.*macro_idx)(z11.VnD(), z2.VnD(), z1.VnD(), z11.VnD(), 0); // zd == zm
16983 __ Mov(z12, z1);
16984 (masm.*macro_idx)(z12.VnD(), z2.VnD(), z12.VnD(), z0.VnD(), 1); // zd == zn
16985 __ Mov(z13, z2);
16986 (masm.*macro_idx)(z13.VnD(), z13.VnD(), z1.VnD(), z0.VnD(), 0); // zd == za
16987 __ Mov(z14, z0);
16988 // zd == zn == zm
16989 (masm.*macro_idx)(z14.VnD(), z2.VnD(), z14.VnD(), z14.VnD(), 1);
16990
TatWai Chonga2c1bb72020-02-16 23:16:47 -080016991 // The indexed forms of Fmla and Fmls never swap their arguments, so pass the
16992 // strict NaN propagation mode to ensure that the following vector-form macros
16993 // don't swap arguments in any case.
16994 FPMacroNaNPropagationOption option = StrictNaNPropagation;
16995 // Compute the results using other instructions.
Martyn Capewellc7501512020-03-16 10:35:33 +000016996 __ Dup(z0.VnH(), z30.VnH(), 0);
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070016997 FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z0.VnH());
Martyn Capewellc7501512020-03-16 10:35:33 +000016998 (masm.*macro)(z15.VnH(), p0.Merging(), z2.VnH(), z1.VnH(), z0.VnH(), option);
16999 __ Dup(z0.VnH(), z30.VnH(), 1);
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070017000 FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z0.VnH());
Martyn Capewellc7501512020-03-16 10:35:33 +000017001 (masm.*macro)(z16.VnH(), p0.Merging(), z2.VnH(), z1.VnH(), z0.VnH(), option);
17002 __ Dup(z0.VnH(), z30.VnH(), 4);
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070017003 FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z0.VnH());
Martyn Capewellc7501512020-03-16 10:35:33 +000017004 (masm.*macro)(z17.VnH(), p0.Merging(), z2.VnH(), z1.VnH(), z0.VnH(), option);
17005 __ Dup(z0.VnH(), z30.VnH(), 7);
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070017006 FPSegmentPatternHelper(&masm, z0.VnH(), p0.Merging(), z0.VnH());
Martyn Capewellc7501512020-03-16 10:35:33 +000017007 (masm.*macro)(z18.VnH(), p0.Merging(), z2.VnH(), z1.VnH(), z0.VnH(), option);
TatWai Chonga2c1bb72020-02-16 23:16:47 -080017008
Martyn Capewellc7501512020-03-16 10:35:33 +000017009 __ Dup(z0.VnS(), z30.VnS(), 0);
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070017010 FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z0.VnS());
Martyn Capewellc7501512020-03-16 10:35:33 +000017011 (masm.*macro)(z19.VnS(), p0.Merging(), z2.VnS(), z1.VnS(), z0.VnS(), option);
17012 __ Dup(z0.VnS(), z30.VnS(), 1);
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070017013 FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z0.VnS());
Martyn Capewellc7501512020-03-16 10:35:33 +000017014 (masm.*macro)(z20.VnS(), p0.Merging(), z2.VnS(), z1.VnS(), z0.VnS(), option);
17015 __ Dup(z0.VnS(), z30.VnS(), 2);
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070017016 FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z0.VnS());
Martyn Capewellc7501512020-03-16 10:35:33 +000017017 (masm.*macro)(z21.VnS(), p0.Merging(), z2.VnS(), z1.VnS(), z0.VnS(), option);
17018 __ Dup(z0.VnS(), z30.VnS(), 3);
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070017019 FPSegmentPatternHelper(&masm, z0.VnS(), p0.Merging(), z0.VnS());
Martyn Capewellc7501512020-03-16 10:35:33 +000017020 (masm.*macro)(z22.VnS(), p0.Merging(), z2.VnS(), z1.VnS(), z0.VnS(), option);
TatWai Chonga2c1bb72020-02-16 23:16:47 -080017021
Martyn Capewellc7501512020-03-16 10:35:33 +000017022 __ Dup(z0.VnD(), z30.VnD(), 0);
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070017023 FPSegmentPatternHelper(&masm, z0.VnD(), p0.Merging(), z0.VnD());
Martyn Capewellc7501512020-03-16 10:35:33 +000017024 (masm.*macro)(z23.VnD(), p0.Merging(), z2.VnD(), z1.VnD(), z0.VnD(), option);
17025 __ Dup(z0.VnD(), z30.VnD(), 1);
TatWai Chongfa3f6bf2020-03-13 00:22:03 -070017026 FPSegmentPatternHelper(&masm, z0.VnD(), p0.Merging(), z0.VnD());
Martyn Capewellc7501512020-03-16 10:35:33 +000017027 (masm.*macro)(z24.VnD(), p0.Merging(), z2.VnD(), z1.VnD(), z0.VnD(), option);
Jacob Bramley8caa8732020-07-01 20:22:38 +010017028 FPSegmentPatternHelper(&masm, z0.VnD(), p0.Merging(), z30.VnD());
17029 __ Dup(z29.VnD(), z30.VnD(), 1);
17030 FPSegmentPatternHelper(&masm, z29.VnD(), p0.Merging(), z29.VnD());
17031 (masm.*macro)(z25.VnD(), p0.Merging(), z2.VnD(), z0.VnD(), z29.VnD(), option);
TatWai Chonga2c1bb72020-02-16 23:16:47 -080017032
17033 END();
17034
17035 if (CAN_RUN()) {
17036 RUN();
17037
17038 ASSERT_EQUAL_SVE(z15.VnH(), z3.VnH());
17039 ASSERT_EQUAL_SVE(z16.VnH(), z4.VnH());
17040 ASSERT_EQUAL_SVE(z17.VnH(), z5.VnH());
17041 ASSERT_EQUAL_SVE(z18.VnH(), z6.VnH());
17042
17043 ASSERT_EQUAL_SVE(z19.VnS(), z7.VnS());
17044 ASSERT_EQUAL_SVE(z20.VnS(), z8.VnS());
17045 ASSERT_EQUAL_SVE(z21.VnS(), z9.VnS());
17046 ASSERT_EQUAL_SVE(z22.VnS(), z10.VnS());
17047
17048 ASSERT_EQUAL_SVE(z23.VnD(), z11.VnD());
17049 ASSERT_EQUAL_SVE(z24.VnD(), z12.VnD());
17050 ASSERT_EQUAL_SVE(z11.VnD(), z13.VnD());
17051 ASSERT_EQUAL_SVE(z25.VnD(), z14.VnD());
17052 }
17053}
17054
17055TEST_SVE(sve_fmla_fmls_index) {
17056 uint64_t zm_inputs_1[] = {0x3ff000003f803c00, 0xbff00000bf80bc00};
17057 uint64_t zn_inputs_1[] = {0x3ff012343ff03c76, 0xbff01234bff0bc76};
17058 uint64_t za_inputs_1[] = {0x3c004000bc00c000, 0x64006800e400e800};
17059
17060 // Using the vector form of Fmla and Fmls to verify the indexed form.
17061 FPMulAccIdxHelper(config,
17062 &MacroAssembler::Fmla, // vector form
17063 &MacroAssembler::Fmla, // indexed form
17064 za_inputs_1,
17065 zn_inputs_1,
17066 zm_inputs_1);
17067
17068 FPMulAccIdxHelper(config,
17069 &MacroAssembler::Fmls, // vector form
17070 &MacroAssembler::Fmls, // indexed form
17071 za_inputs_1,
17072 zn_inputs_1,
17073 zm_inputs_1);
17074
17075 uint64_t zm_inputs_2[] = {0x7ff5555511111111, // NaN
17076 0xfff0000000000000}; // Infinity
17077 uint64_t zn_inputs_2[] = {0x7f9511117fc00000, // NaN
17078 0x7f800000ff800000}; // Infinity
17079 uint64_t za_inputs_2[] = {0x7c11000000007e00, // NaN
17080 0x000000007c00fc00}; // Infinity
17081 FPMulAccIdxHelper(config,
17082 &MacroAssembler::Fmla, // vector form
17083 &MacroAssembler::Fmla, // indexed form
17084 za_inputs_2,
17085 zn_inputs_2,
17086 zm_inputs_2);
17087
17088 FPMulAccIdxHelper(config,
17089 &MacroAssembler::Fmls, // vector form
17090 &MacroAssembler::Fmls, // indexed form
17091 za_inputs_2,
17092 zn_inputs_2,
17093 zm_inputs_2);
17094}
17095
TatWai Chongf8d29f12020-02-16 22:53:18 -080017096// Execute a number of instructions which all use ProcessNaNs, and check that
17097// they all propagate NaNs correctly.
17098template <typename Ti, typename Td, size_t N>
17099static void ProcessNaNsHelper(Test* config,
17100 int lane_size_in_bits,
17101 const Ti (&zn_inputs)[N],
17102 const Ti (&zm_inputs)[N],
17103 const Td (&zd_expected)[N],
17104 FPMacroNaNPropagationOption nan_option) {
17105 ArithFn arith_unpredicated_macro[] = {&MacroAssembler::Fadd,
17106 &MacroAssembler::Fsub,
17107 &MacroAssembler::Fmul};
17108
17109 for (size_t i = 0; i < ArrayLength(arith_unpredicated_macro); i++) {
17110 FPBinArithHelper(config,
17111 arith_unpredicated_macro[i],
17112 lane_size_in_bits,
17113 zn_inputs,
17114 zm_inputs,
17115 zd_expected);
17116 }
17117
17118 FPArithPredicatedFn arith_predicated_macro[] = {&MacroAssembler::Fmax,
17119 &MacroAssembler::Fmin};
17120 int pg_inputs[N];
17121 // With an all-true predicate, this helper focuses on comparisons involving
17122 // special numbers.
17123 for (size_t i = 0; i < N; i++) {
17124 pg_inputs[i] = 1;
17125 }
17126
17127 // fdivr propagates the quotient (Zm) preferentially, so we don't actually
17128 // need any special handling for StrictNaNPropagation.
17129 FPBinArithHelper(config,
17130 NULL,
17131 &MacroAssembler::Fdiv,
17132 lane_size_in_bits,
17133 // With an all-true predicate, the value in zd is
17134 // irrelevant to the operations.
17135 zn_inputs,
17136 pg_inputs,
17137 zn_inputs,
17138 zm_inputs,
17139 zd_expected);
17140
17141 for (size_t i = 0; i < ArrayLength(arith_predicated_macro); i++) {
17142 FPBinArithHelper(config,
17143 arith_predicated_macro[i],
17144 NULL,
17145 lane_size_in_bits,
17146 // With an all-true predicate, the value in zd is
17147 // irrelevant to the operations.
17148 zn_inputs,
17149 pg_inputs,
17150 zn_inputs,
17151 zm_inputs,
17152 zd_expected,
17153 nan_option);
17154 }
17155}
17156
17157template <typename Ti, typename Td, size_t N>
17158static void ProcessNaNsHelper3(Test* config,
17159 int lane_size_in_bits,
17160 const Ti (&za_inputs)[N],
17161 const Ti (&zn_inputs)[N],
17162 const Ti (&zm_inputs)[N],
17163 const Td (&zd_expected_fmla)[N],
17164 const Td (&zd_expected_fmls)[N],
17165 const Td (&zd_expected_fnmla)[N],
17166 const Td (&zd_expected_fnmls)[N],
17167 FPMacroNaNPropagationOption nan_option) {
17168 int pg_inputs[N];
17169 // With an all-true predicate, this helper focuses on comparisons involving
17170 // special numbers.
17171 for (size_t i = 0; i < N; i++) {
17172 pg_inputs[i] = 1;
17173 }
17174
17175 FPMulAccHelper(config,
17176 &MacroAssembler::Fmla,
17177 lane_size_in_bits,
17178 pg_inputs,
17179 za_inputs,
17180 zn_inputs,
17181 zm_inputs,
17182 zd_expected_fmla,
17183 nan_option);
17184
17185 FPMulAccHelper(config,
17186 &MacroAssembler::Fmls,
17187 lane_size_in_bits,
17188 pg_inputs,
17189 za_inputs,
17190 zn_inputs,
17191 zm_inputs,
17192 zd_expected_fmls,
17193 nan_option);
17194
17195 FPMulAccHelper(config,
17196 &MacroAssembler::Fnmla,
17197 lane_size_in_bits,
17198 pg_inputs,
17199 za_inputs,
17200 zn_inputs,
17201 zm_inputs,
17202 zd_expected_fnmla,
17203 nan_option);
17204
17205 FPMulAccHelper(config,
17206 &MacroAssembler::Fnmls,
17207 lane_size_in_bits,
17208 pg_inputs,
17209 za_inputs,
17210 zn_inputs,
17211 zm_inputs,
17212 zd_expected_fnmls,
17213 nan_option);
17214}
17215
17216TEST_SVE(sve_process_nans_double) {
17217 // Use non-standard NaNs to check that the payload bits are preserved.
17218 double sa = RawbitsToDouble(0x7ff5555511111111);
17219 double sn = RawbitsToDouble(0x7ff5555522222222);
17220 double sm = RawbitsToDouble(0x7ff5555533333333);
17221 double qa = RawbitsToDouble(0x7ffaaaaa11111111);
17222 double qn = RawbitsToDouble(0x7ffaaaaa22222222);
17223 double qm = RawbitsToDouble(0x7ffaaaaa33333333);
17224 VIXL_ASSERT(IsSignallingNaN(sa));
17225 VIXL_ASSERT(IsSignallingNaN(sn));
17226 VIXL_ASSERT(IsSignallingNaN(sm));
17227 VIXL_ASSERT(IsQuietNaN(qa));
17228 VIXL_ASSERT(IsQuietNaN(qn));
17229 VIXL_ASSERT(IsQuietNaN(qm));
17230
17231 // The input NaNs after passing through ProcessNaN.
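  // (A signalling NaN is quietened by setting the topmost fraction bit, bit 51
  // for doubles, so 0x7ff5... becomes 0x7ffd...; quiet NaNs pass through
  // unchanged.)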
17232 uint64_t sa_proc = 0x7ffd555511111111;
17233 uint64_t sn_proc = 0x7ffd555522222222;
17234 uint64_t sm_proc = 0x7ffd555533333333;
17235 uint64_t qa_proc = DoubleToRawbits(qa);
17236 uint64_t qn_proc = DoubleToRawbits(qn);
17237 uint64_t qm_proc = DoubleToRawbits(qm);
17238 uint64_t sa_proc_n = sa_proc ^ kDSignMask;
17239 uint64_t sn_proc_n = sn_proc ^ kDSignMask;
17240 uint64_t qa_proc_n = qa_proc ^ kDSignMask;
17241 uint64_t qn_proc_n = qn_proc ^ kDSignMask;
17242
17243 // Quiet NaNs are propagated.
17244 double zn_inputs_1[] = {qn, 0.0, 0.0, qm, qn, qm};
17245 double zm_inputs_1[] = {0.0, qn, qm, 0.0, qm, qn};
17246 uint64_t zd_expected_1[] =
17247 {qn_proc, qn_proc, qm_proc, qm_proc, qn_proc, qm_proc};
17248
17249 ProcessNaNsHelper(config,
17250 kDRegSize,
17251 zn_inputs_1,
17252 zm_inputs_1,
17253 zd_expected_1,
17254 StrictNaNPropagation);
17255
17256 // Signalling NaNs are propagated.
17257 double zn_inputs_2[] = {sn, 0.0, 0.0, sm, sn, sm};
17258 double zm_inputs_2[] = {0.0, sn, sm, 0.0, sm, sn};
17259 uint64_t zd_expected_2[] =
17260 {sn_proc, sn_proc, sm_proc, sm_proc, sn_proc, sm_proc};
17261 ProcessNaNsHelper(config,
17262 kDRegSize,
17263 zn_inputs_2,
17264 zm_inputs_2,
17265 zd_expected_2,
17266 StrictNaNPropagation);
17267
17268 // Signalling NaNs take precedence over quiet NaNs.
17269 double zn_inputs_3[] = {sn, qn, sn, sn, qn};
17270 double zm_inputs_3[] = {qm, sm, sm, qn, sn};
17271 uint64_t zd_expected_3[] = {sn_proc, sm_proc, sn_proc, sn_proc, sn_proc};
17272 ProcessNaNsHelper(config,
17273 kDRegSize,
17274 zn_inputs_3,
17275 zm_inputs_3,
17276 zd_expected_3,
17277 StrictNaNPropagation);
17278
17279 double za_inputs_4[] = {qa, qa, 0.0, 0.0, qa, qa};
17280 double zn_inputs_4[] = {qn, 0.0, 0.0, qn, qn, qn};
17281 double zm_inputs_4[] = {0.0, qm, qm, qm, qm, 0.0};
17282
17283 // If `a` is propagated, its sign is inverted by fnmla and fnmls.
17284 // If `n` is propagated, its sign is inverted by fmls and fnmla.
17285 // If `m` is propagated, its sign is never inverted.
17286 uint64_t zd_expected_fmla_4[] =
17287 {qa_proc, qa_proc, qm_proc, qn_proc, qa_proc, qa_proc};
17288 uint64_t zd_expected_fmls_4[] =
17289 {qa_proc, qa_proc, qm_proc, qn_proc_n, qa_proc, qa_proc};
17290 uint64_t zd_expected_fnmla_4[] =
17291 {qa_proc_n, qa_proc_n, qm_proc, qn_proc_n, qa_proc_n, qa_proc_n};
17292 uint64_t zd_expected_fnmls_4[] =
17293 {qa_proc_n, qa_proc_n, qm_proc, qn_proc, qa_proc_n, qa_proc_n};
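  // For example, the fourth elements (za = 0.0, zn = qn, zm = qm) propagate
  // `n`, so the expected results are qn_proc for fmla, qn_proc_n for fmls and
  // fnmla, and qn_proc for fnmls.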
17294
17295 ProcessNaNsHelper3(config,
17296 kDRegSize,
17297 za_inputs_4,
17298 zn_inputs_4,
17299 zm_inputs_4,
17300 zd_expected_fmla_4,
17301 zd_expected_fmls_4,
17302 zd_expected_fnmla_4,
17303 zd_expected_fnmls_4,
17304 StrictNaNPropagation);
17305
17306 // Signalling NaNs take precedence over quiet NaNs.
17307 double za_inputs_5[] = {qa, qa, sa, sa, sa};
17308 double zn_inputs_5[] = {qn, sn, sn, sn, qn};
17309 double zm_inputs_5[] = {sm, qm, sm, qa, sm};
17310 uint64_t zd_expected_fmla_5[] = {sm_proc, sn_proc, sa_proc, sa_proc, sa_proc};
17311 uint64_t zd_expected_fmls_5[] = {sm_proc,
17312 sn_proc_n,
17313 sa_proc,
17314 sa_proc,
17315 sa_proc};
17316 uint64_t zd_expected_fnmla_5[] = {sm_proc,
17317 sn_proc_n,
17318 sa_proc_n,
17319 sa_proc_n,
17320 sa_proc_n};
17321 uint64_t zd_expected_fnmls_5[] = {sm_proc,
17322 sn_proc,
17323 sa_proc_n,
17324 sa_proc_n,
17325 sa_proc_n};
17326
17327 ProcessNaNsHelper3(config,
17328 kDRegSize,
17329 za_inputs_5,
17330 zn_inputs_5,
17331 zm_inputs_5,
17332 zd_expected_fmla_5,
17333 zd_expected_fmls_5,
17334 zd_expected_fnmla_5,
17335 zd_expected_fnmls_5,
17336 StrictNaNPropagation);
17337
17338 const double inf = kFP64PositiveInfinity;
17339 const double inf_n = kFP64NegativeInfinity;
17340 uint64_t inf_proc = DoubleToRawbits(inf);
17341 uint64_t inf_proc_n = DoubleToRawbits(inf_n);
17342 uint64_t d_inf_proc = DoubleToRawbits(kFP64DefaultNaN);
17343
17344  double za_inputs_6[] = {qa, qa, 0.0, -0.0, qa, sa};
17345  double zn_inputs_6[] = {inf, -0.0, -0.0, inf, inf_n, inf};
17346  double zm_inputs_6[] = {0.0, inf_n, inf, inf, inf, 0.0};
17347
17348 // quiet_nan + (0.0 * inf) produces the default NaN, not quiet_nan. Ditto for
17349 // (inf * 0.0). On the other hand, quiet_nan + (inf * inf) propagates the
17350 // quiet_nan.
17351 uint64_t zd_expected_fmla_6[] =
17352 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc, sa_proc};
17353 uint64_t zd_expected_fmls_6[] =
17354 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc, sa_proc};
17355 uint64_t zd_expected_fnmla_6[] =
17356 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc_n, sa_proc_n};
17357 uint64_t zd_expected_fnmls_6[] =
17358 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc_n, sa_proc_n};
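  // For example, the first elements (za = qa, zn = inf, zm = 0.0) make the
  // multiply invalid, so the result is the default NaN (0x7ff8000000000000)
  // rather than qa; the last elements show that a signalling `a` still takes
  // precedence.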
17359
17360 ProcessNaNsHelper3(config,
17361 kDRegSize,
17362 za_inputs_6,
17363 zn_inputs_6,
17364 zm_inputs_6,
17365 zd_expected_fmla_6,
17366 zd_expected_fmls_6,
17367 zd_expected_fnmla_6,
17368 zd_expected_fnmls_6,
17369 StrictNaNPropagation);
17370}
17371
17372TEST_SVE(sve_process_nans_float) {
17373 // Use non-standard NaNs to check that the payload bits are preserved.
17374 float sa = RawbitsToFloat(0x7f951111);
17375 float sn = RawbitsToFloat(0x7f952222);
17376 float sm = RawbitsToFloat(0x7f953333);
17377 float qa = RawbitsToFloat(0x7fea1111);
17378 float qn = RawbitsToFloat(0x7fea2222);
17379 float qm = RawbitsToFloat(0x7fea3333);
17380 VIXL_ASSERT(IsSignallingNaN(sa));
17381 VIXL_ASSERT(IsSignallingNaN(sn));
17382 VIXL_ASSERT(IsSignallingNaN(sm));
17383 VIXL_ASSERT(IsQuietNaN(qa));
17384 VIXL_ASSERT(IsQuietNaN(qn));
17385 VIXL_ASSERT(IsQuietNaN(qm));
17386
17387 // The input NaNs after passing through ProcessNaN.
17388 uint32_t sa_proc = 0x7fd51111;
17389 uint32_t sn_proc = 0x7fd52222;
17390 uint32_t sm_proc = 0x7fd53333;
17391 uint32_t qa_proc = FloatToRawbits(qa);
17392 uint32_t qn_proc = FloatToRawbits(qn);
17393 uint32_t qm_proc = FloatToRawbits(qm);
17394 uint32_t sa_proc_n = sa_proc ^ kSSignMask;
17395 uint32_t sn_proc_n = sn_proc ^ kSSignMask;
17396 uint32_t qa_proc_n = qa_proc ^ kSSignMask;
17397 uint32_t qn_proc_n = qn_proc ^ kSSignMask;
17398
17399 // Quiet NaNs are propagated.
17400 float zn_inputs_1[] = {qn, 0.0f, 0.0f, qm, qn, qm};
17401 float zm_inputs_1[] = {0.0f, qn, qm, 0.0f, qm, qn};
17402 uint64_t zd_expected_1[] =
17403 {qn_proc, qn_proc, qm_proc, qm_proc, qn_proc, qm_proc};
17404
17405 ProcessNaNsHelper(config,
17406 kSRegSize,
17407 zn_inputs_1,
17408 zm_inputs_1,
17409 zd_expected_1,
17410 StrictNaNPropagation);
17411
17412 // Signalling NaNs are propagated.
17413 float zn_inputs_2[] = {sn, 0.0f, 0.0f, sm, sn, sm};
17414 float zm_inputs_2[] = {0.0f, sn, sm, 0.0f, sm, sn};
17415 uint64_t zd_expected_2[] =
17416 {sn_proc, sn_proc, sm_proc, sm_proc, sn_proc, sm_proc};
17417 ProcessNaNsHelper(config,
17418 kSRegSize,
17419 zn_inputs_2,
17420 zm_inputs_2,
17421 zd_expected_2,
17422 StrictNaNPropagation);
17423
17424 // Signalling NaNs take precedence over quiet NaNs.
17425 float zn_inputs_3[] = {sn, qn, sn, sn, qn};
17426 float zm_inputs_3[] = {qm, sm, sm, qn, sn};
17427 uint64_t zd_expected_3[] = {sn_proc, sm_proc, sn_proc, sn_proc, sn_proc};
17428 ProcessNaNsHelper(config,
17429 kSRegSize,
17430 zn_inputs_3,
17431 zm_inputs_3,
17432 zd_expected_3,
17433 StrictNaNPropagation);
17434
17435 float za_inputs_4[] = {qa, qa, 0.0f, 0.0f, qa, qa};
17436 float zn_inputs_4[] = {qn, 0.0f, 0.0f, qn, qn, qn};
17437 float zm_inputs_4[] = {0.0f, qm, qm, qm, qm, 0.0f};
17438
17439 // If `a` is propagated, its sign is inverted by fnmla and fnmls.
17440 // If `n` is propagated, its sign is inverted by fmls and fnmla.
17441 // If `m` is propagated, its sign is never inverted.
17442 uint64_t zd_expected_fmla_4[] =
17443 {qa_proc, qa_proc, qm_proc, qn_proc, qa_proc, qa_proc};
17444 uint64_t zd_expected_fmls_4[] =
17445 {qa_proc, qa_proc, qm_proc, qn_proc_n, qa_proc, qa_proc};
17446 uint64_t zd_expected_fnmla_4[] =
17447 {qa_proc_n, qa_proc_n, qm_proc, qn_proc_n, qa_proc_n, qa_proc_n};
17448 uint64_t zd_expected_fnmls_4[] =
17449 {qa_proc_n, qa_proc_n, qm_proc, qn_proc, qa_proc_n, qa_proc_n};
17450
17451 ProcessNaNsHelper3(config,
17452 kSRegSize,
17453 za_inputs_4,
17454 zn_inputs_4,
17455 zm_inputs_4,
17456 zd_expected_fmla_4,
17457 zd_expected_fmls_4,
17458 zd_expected_fnmla_4,
17459 zd_expected_fnmls_4,
17460 StrictNaNPropagation);
17461
17462 // Signalling NaNs take precedence over quiet NaNs.
17463 float za_inputs_5[] = {qa, qa, sa, sa, sa};
17464 float zn_inputs_5[] = {qn, sn, sn, sn, qn};
17465 float zm_inputs_5[] = {sm, qm, sm, qa, sm};
17466 uint64_t zd_expected_fmla_5[] = {sm_proc, sn_proc, sa_proc, sa_proc, sa_proc};
17467 uint64_t zd_expected_fmls_5[] = {sm_proc,
17468 sn_proc_n,
17469 sa_proc,
17470 sa_proc,
17471 sa_proc};
17472 uint64_t zd_expected_fnmla_5[] = {sm_proc,
17473 sn_proc_n,
17474 sa_proc_n,
17475 sa_proc_n,
17476 sa_proc_n};
17477 uint64_t zd_expected_fnmls_5[] = {sm_proc,
17478 sn_proc,
17479 sa_proc_n,
17480 sa_proc_n,
17481 sa_proc_n};
17482
17483 ProcessNaNsHelper3(config,
17484 kSRegSize,
17485 za_inputs_5,
17486 zn_inputs_5,
17487 zm_inputs_5,
17488 zd_expected_fmla_5,
17489 zd_expected_fmls_5,
17490 zd_expected_fnmla_5,
17491 zd_expected_fnmls_5,
17492 StrictNaNPropagation);
17493
17494 const float inf = kFP32PositiveInfinity;
17495 const float inf_n = kFP32NegativeInfinity;
17496 uint32_t inf_proc = FloatToRawbits(inf);
17497 uint32_t inf_proc_n = FloatToRawbits(inf_n);
17498 uint32_t d_inf_proc = FloatToRawbits(kFP32DefaultNaN);
17499
17500 float za_inputs_6[] = {qa, qa, 0.0f, 0.0f, qa, sa};
17501 float zn_inputs_6[] = {inf, 0.0f, 0.0f, inf, inf_n, inf};
17502 float zm_inputs_6[] = {0.0f, inf_n, inf, inf, inf, 0.0f};
17503
17504 // quiet_nan + (0.0 * inf) produces the default NaN, not quiet_nan. Ditto for
17505 // (inf * 0.0). On the other hand, quiet_nan + (inf * inf) propagates the
17506 // quiet_nan.
17507 uint64_t zd_expected_fmla_6[] =
17508 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc, sa_proc};
17509 uint64_t zd_expected_fmls_6[] =
17510 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc, sa_proc};
17511 uint64_t zd_expected_fnmla_6[] =
17512 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc_n, sa_proc_n};
17513 uint64_t zd_expected_fnmls_6[] =
17514 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc_n, sa_proc_n};
17515
17516 ProcessNaNsHelper3(config,
17517 kSRegSize,
17518 za_inputs_6,
17519 zn_inputs_6,
17520 zm_inputs_6,
17521 zd_expected_fmla_6,
17522 zd_expected_fmls_6,
17523 zd_expected_fnmla_6,
17524 zd_expected_fnmls_6,
17525 StrictNaNPropagation);
17526}
17527
17528TEST_SVE(sve_process_nans_half) {
17529 // Use non-standard NaNs to check that the payload bits are preserved.
17530 Float16 sa(RawbitsToFloat16(0x7c11));
17531 Float16 sn(RawbitsToFloat16(0x7c22));
17532 Float16 sm(RawbitsToFloat16(0x7c33));
17533 Float16 qa(RawbitsToFloat16(0x7e44));
17534 Float16 qn(RawbitsToFloat16(0x7e55));
17535 Float16 qm(RawbitsToFloat16(0x7e66));
17536 VIXL_ASSERT(IsSignallingNaN(sa));
17537 VIXL_ASSERT(IsSignallingNaN(sn));
17538 VIXL_ASSERT(IsSignallingNaN(sm));
17539 VIXL_ASSERT(IsQuietNaN(qa));
17540 VIXL_ASSERT(IsQuietNaN(qn));
17541 VIXL_ASSERT(IsQuietNaN(qm));
17542
17543 // The input NaNs after passing through ProcessNaN.
17544 uint16_t sa_proc = 0x7e11;
17545 uint16_t sn_proc = 0x7e22;
17546 uint16_t sm_proc = 0x7e33;
17547 uint16_t qa_proc = Float16ToRawbits(qa);
17548 uint16_t qn_proc = Float16ToRawbits(qn);
17549 uint16_t qm_proc = Float16ToRawbits(qm);
17550 uint16_t sa_proc_n = sa_proc ^ kHSignMask;
17551 uint16_t sn_proc_n = sn_proc ^ kHSignMask;
17552 uint16_t qa_proc_n = qa_proc ^ kHSignMask;
17553 uint16_t qn_proc_n = qn_proc ^ kHSignMask;
17554 Float16 zero(0.0);
17555
17556 // Quiet NaNs are propagated.
17557 Float16 zn_inputs_1[] = {qn, zero, zero, qm, qn, qm};
17558 Float16 zm_inputs_1[] = {zero, qn, qm, zero, qm, qn};
17559 uint64_t zd_expected_1[] =
17560 {qn_proc, qn_proc, qm_proc, qm_proc, qn_proc, qm_proc};
17561
17562 ProcessNaNsHelper(config,
17563 kHRegSize,
17564 zn_inputs_1,
17565 zm_inputs_1,
17566 zd_expected_1,
17567 StrictNaNPropagation);
17568
17569 // Signalling NaNs are propagated.
17570 Float16 zn_inputs_2[] = {sn, zero, zero, sm, sn, sm};
17571 Float16 zm_inputs_2[] = {zero, sn, sm, zero, sm, sn};
17572 uint64_t zd_expected_2[] =
17573 {sn_proc, sn_proc, sm_proc, sm_proc, sn_proc, sm_proc};
17574 ProcessNaNsHelper(config,
17575 kHRegSize,
17576 zn_inputs_2,
17577 zm_inputs_2,
17578 zd_expected_2,
17579 StrictNaNPropagation);
17580
17581 // Signalling NaNs take precedence over quiet NaNs.
17582 Float16 zn_inputs_3[] = {sn, qn, sn, sn, qn};
17583 Float16 zm_inputs_3[] = {qm, sm, sm, qn, sn};
17584 uint64_t zd_expected_3[] = {sn_proc, sm_proc, sn_proc, sn_proc, sn_proc};
17585 ProcessNaNsHelper(config,
17586 kHRegSize,
17587 zn_inputs_3,
17588 zm_inputs_3,
17589 zd_expected_3,
17590 StrictNaNPropagation);
17591
17592 Float16 za_inputs_4[] = {qa, qa, zero, zero, qa, qa};
17593 Float16 zn_inputs_4[] = {qn, zero, zero, qn, qn, qn};
17594 Float16 zm_inputs_4[] = {zero, qm, qm, qm, qm, zero};
17595
17596 // If `a` is propagated, its sign is inverted by fnmla and fnmls.
17597 // If `n` is propagated, its sign is inverted by fmls and fnmla.
17598 // If `m` is propagated, its sign is never inverted.
17599 uint64_t zd_expected_fmla_4[] =
17600 {qa_proc, qa_proc, qm_proc, qn_proc, qa_proc, qa_proc};
17601 uint64_t zd_expected_fmls_4[] =
17602 {qa_proc, qa_proc, qm_proc, qn_proc_n, qa_proc, qa_proc};
17603 uint64_t zd_expected_fnmla_4[] =
17604 {qa_proc_n, qa_proc_n, qm_proc, qn_proc_n, qa_proc_n, qa_proc_n};
17605 uint64_t zd_expected_fnmls_4[] =
17606 {qa_proc_n, qa_proc_n, qm_proc, qn_proc, qa_proc_n, qa_proc_n};
17607
17608 ProcessNaNsHelper3(config,
17609 kHRegSize,
17610 za_inputs_4,
17611 zn_inputs_4,
17612 zm_inputs_4,
17613 zd_expected_fmla_4,
17614 zd_expected_fmls_4,
17615 zd_expected_fnmla_4,
17616 zd_expected_fnmls_4,
17617 StrictNaNPropagation);
17618
17619 // Signalling NaNs take precedence over quiet NaNs.
17620 Float16 za_inputs_5[] = {qa, qa, sa, sa, sa};
17621 Float16 zn_inputs_5[] = {qn, sn, sn, sn, qn};
17622 Float16 zm_inputs_5[] = {sm, qm, sm, qa, sm};
17623 uint64_t zd_expected_fmla_5[] = {sm_proc, sn_proc, sa_proc, sa_proc, sa_proc};
17624 uint64_t zd_expected_fmls_5[] = {sm_proc,
17625 sn_proc_n,
17626 sa_proc,
17627 sa_proc,
17628 sa_proc};
17629 uint64_t zd_expected_fnmla_5[] = {sm_proc,
17630 sn_proc_n,
17631 sa_proc_n,
17632 sa_proc_n,
17633 sa_proc_n};
17634 uint64_t zd_expected_fnmls_5[] = {sm_proc,
17635 sn_proc,
17636 sa_proc_n,
17637 sa_proc_n,
17638 sa_proc_n};
17639
17640 ProcessNaNsHelper3(config,
17641 kHRegSize,
17642 za_inputs_5,
17643 zn_inputs_5,
17644 zm_inputs_5,
17645 zd_expected_fmla_5,
17646 zd_expected_fmls_5,
17647 zd_expected_fnmla_5,
17648 zd_expected_fnmls_5,
17649 StrictNaNPropagation);
17650
17651 const Float16 inf = kFP16PositiveInfinity;
17652 const Float16 inf_n = kFP16NegativeInfinity;
17653 uint64_t inf_proc = Float16ToRawbits(inf);
17654 uint64_t inf_proc_n = Float16ToRawbits(inf_n);
17655 uint64_t d_inf_proc = Float16ToRawbits(kFP16DefaultNaN);
17656
17657 Float16 za_inputs_6[] = {qa, qa, zero, zero, qa, sa};
17658 Float16 zn_inputs_6[] = {inf, zero, zero, inf, inf_n, inf};
17659 Float16 zm_inputs_6[] = {zero, inf_n, inf, inf, inf, zero};
17660
17661 // quiet_nan + (0.0 * inf) produces the default NaN, not quiet_nan. Ditto for
17662 // (inf * 0.0). On the other hand, quiet_nan + (inf * inf) propagates the
17663 // quiet_nan.
17664 uint64_t zd_expected_fmla_6[] =
17665 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc, sa_proc};
17666 uint64_t zd_expected_fmls_6[] =
17667 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc, sa_proc};
17668 uint64_t zd_expected_fnmla_6[] =
17669 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc_n, qa_proc_n, sa_proc_n};
17670 uint64_t zd_expected_fnmls_6[] =
17671 {d_inf_proc, d_inf_proc, d_inf_proc, inf_proc, qa_proc_n, sa_proc_n};
17672
17673 ProcessNaNsHelper3(config,
17674 kHRegSize,
17675 za_inputs_6,
17676 zn_inputs_6,
17677 zm_inputs_6,
17678 zd_expected_fmla_6,
17679 zd_expected_fmls_6,
17680 zd_expected_fnmla_6,
17681 zd_expected_fnmls_6,
17682 StrictNaNPropagation);
17683}
17684
TatWai Chong47c26842020-02-10 01:51:32 -080017685typedef void (MacroAssembler::*FCmpFn)(const PRegisterWithLaneSize& pd,
17686 const PRegisterZ& pg,
17687 const ZRegister& zn,
17688 const ZRegister& zm);
17689
TatWai Chonge3775132020-02-16 22:13:17 -080017690typedef void (MacroAssembler::*FCmpZeroFn)(const PRegisterWithLaneSize& pd,
17691 const PRegisterZ& pg,
Jacob Bramley5a5e71f2020-07-02 13:54:58 +010017692 const ZRegister& zn,
17693 double zero);
TatWai Chonge3775132020-02-16 22:13:17 -080017694
TatWai Chong47c26842020-02-10 01:51:32 -080017695typedef void (MacroAssembler::*CmpFn)(const PRegisterWithLaneSize& pd,
17696 const PRegisterZ& pg,
17697 const ZRegister& zn,
17698 const ZRegister& zm);
17699
17700static FCmpFn GetFpAbsCompareFn(Condition cond) {
17701 switch (cond) {
17702 case ge:
17703 return &MacroAssembler::Facge;
17704 case gt:
17705 return &MacroAssembler::Facgt;
17706 case le:
17707 return &MacroAssembler::Facle;
17708 case lt:
17709 return &MacroAssembler::Faclt;
17710 default:
17711 VIXL_UNIMPLEMENTED();
17712 return NULL;
17713 }
17714}
17715
17716static FCmpFn GetFpCompareFn(Condition cond) {
17717 switch (cond) {
17718 case ge:
17719 return &MacroAssembler::Fcmge;
17720 case gt:
17721 return &MacroAssembler::Fcmgt;
17722 case le:
17723 return &MacroAssembler::Fcmle;
17724 case lt:
17725 return &MacroAssembler::Fcmlt;
17726 case eq:
17727 return &MacroAssembler::Fcmeq;
17728 case ne:
17729 return &MacroAssembler::Fcmne;
17730 case uo:
17731 return &MacroAssembler::Fcmuo;
17732 default:
17733 VIXL_UNIMPLEMENTED();
17734 return NULL;
17735 }
17736}
17737
TatWai Chonge3775132020-02-16 22:13:17 -080017738static FCmpZeroFn GetFpCompareZeroFn(Condition cond) {
17739 switch (cond) {
17740 case ge:
17741 return &MacroAssembler::Fcmge;
17742 case gt:
17743 return &MacroAssembler::Fcmgt;
17744 case le:
17745 return &MacroAssembler::Fcmle;
17746 case lt:
17747 return &MacroAssembler::Fcmlt;
17748 case eq:
17749 return &MacroAssembler::Fcmeq;
17750 case ne:
17751 return &MacroAssembler::Fcmne;
17752 default:
17753 VIXL_UNIMPLEMENTED();
17754 return NULL;
17755 }
17756}
17757
TatWai Chong47c26842020-02-10 01:51:32 -080017758static CmpFn GetIntCompareFn(Condition cond) {
17759 switch (cond) {
17760 case ge:
17761 return &MacroAssembler::Cmpge;
17762 case gt:
17763 return &MacroAssembler::Cmpgt;
17764 case le:
17765 return &MacroAssembler::Cmple;
17766 case lt:
17767 return &MacroAssembler::Cmplt;
17768 case eq:
17769 return &MacroAssembler::Cmpeq;
17770 case ne:
17771 return &MacroAssembler::Cmpne;
17772 default:
17773 VIXL_UNIMPLEMENTED();
17774 return NULL;
17775 }
17776}
17777
17778template <size_t N>
17779static void TestFpCompareHelper(Test* config,
17780 int lane_size_in_bits,
17781 Condition cond,
17782 const double (&zn_inputs)[N],
17783 const double (&zm_inputs)[N],
17784 const int (&pd_expected)[N],
17785 bool is_absolute = false) {
17786 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
17787 START();
17788
17789 ZRegister zt_int_1 = z1.WithLaneSize(lane_size_in_bits);
17790 ZRegister zt_int_2 = z2.WithLaneSize(lane_size_in_bits);
17791 ZRegister zt_int_3 = z3.WithLaneSize(lane_size_in_bits);
17792 ZRegister zt_fp_1 = z11.WithLaneSize(lane_size_in_bits);
17793 ZRegister zt_fp_2 = z12.WithLaneSize(lane_size_in_bits);
17794 ZRegister zt_fp_3 = z13.WithLaneSize(lane_size_in_bits);
17795 ZRegister fp_one = z31.WithLaneSize(lane_size_in_bits);
17796
17797 PRegisterWithLaneSize pd_result_int_1 = p15.WithLaneSize(lane_size_in_bits);
17798 PRegisterWithLaneSize pd_result_fp_1 = p14.WithLaneSize(lane_size_in_bits);
17799 PRegisterWithLaneSize pd_result_int_2 = p13.WithLaneSize(lane_size_in_bits);
17800 PRegisterWithLaneSize pd_result_fp_2 = p12.WithLaneSize(lane_size_in_bits);
17801
17802 FCmpFn fcmp = is_absolute ? GetFpAbsCompareFn(cond) : GetFpCompareFn(cond);
17803 __ Ptrue(p1.VnB());
17804
17805 if (cond != uo) {
17806 int pg_inputs[] = {1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1};
17807 Initialise(&masm, p0.WithLaneSize(lane_size_in_bits), pg_inputs);
17808
17809 __ Fdup(fp_one, 0.1f);
17810
17811 __ Index(zt_int_1, 3, 3);
17812 __ Scvtf(zt_fp_1, p0.Merging(), zt_int_1);
17813 __ Fadd(zt_fp_1, zt_fp_1, fp_one);
17814
17815 __ Index(zt_int_2, 3, -10);
17816 __ Scvtf(zt_fp_2, p0.Merging(), zt_int_2);
17817 __ Fadd(zt_fp_2, zt_fp_2, fp_one);
17818
17819 __ Index(zt_int_3, 3, 2);
17820 __ Scvtf(zt_fp_3, p0.Merging(), zt_int_3);
17821 __ Fadd(zt_fp_3, zt_fp_3, fp_one);
17822
17823
17824    // There is no absolute comparison for integer types, so use `abs` with
17825    // `cmp<cc>` to synthesize the expected result for `fac<cc>`.
17826    if (is_absolute) {
17827 __ Abs(zt_int_2, p1.Merging(), zt_int_2);
17828 }
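    // Note that `fac<cc>` compares absolute values, so `Fac<cc>(zn, zm)` is
    // equivalent to `Fcm<cc>(|zn|, |zm|)`; taking `Abs` of the (possibly
    // negative) integer operand gives a matching integer reference.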
17829
17830 CmpFn cmp = GetIntCompareFn(cond);
17831 (masm.*cmp)(pd_result_int_1, p0.Zeroing(), zt_int_1, zt_int_2);
17832 (masm.*fcmp)(pd_result_fp_1, p0.Zeroing(), zt_fp_1, zt_fp_2);
17833
17834 (masm.*cmp)(pd_result_int_2, p0.Zeroing(), zt_int_1, zt_int_3);
17835 (masm.*fcmp)(pd_result_fp_2, p0.Zeroing(), zt_fp_1, zt_fp_3);
17836 }
17837
17838 uint64_t zn_inputs_rawbits[N];
17839 uint64_t zm_inputs_rawbits[N];
17840 FPToRawbitsWithSize(zn_inputs, zn_inputs_rawbits, lane_size_in_bits);
17841 FPToRawbitsWithSize(zm_inputs, zm_inputs_rawbits, lane_size_in_bits);
17842
17843 ZRegister zn_fp = z14.WithLaneSize(lane_size_in_bits);
17844 ZRegister zm_fp = z15.WithLaneSize(lane_size_in_bits);
17845 InsrHelper(&masm, zn_fp, zn_inputs_rawbits);
17846 InsrHelper(&masm, zm_fp, zm_inputs_rawbits);
17847
17848 PRegisterWithLaneSize pd_result_fp_3 = p11.WithLaneSize(lane_size_in_bits);
17849 (masm.*fcmp)(pd_result_fp_3, p1.Zeroing(), zn_fp, zm_fp);
17850
17851 END();
17852
17853 if (CAN_RUN()) {
17854 RUN();
17855
17856 if (cond != uo) {
17857 ASSERT_EQUAL_SVE(pd_result_int_1, pd_result_fp_1);
17858 ASSERT_EQUAL_SVE(pd_result_int_2, pd_result_fp_2);
17859 }
17860 ASSERT_EQUAL_SVE(pd_expected, pd_result_fp_3);
17861 }
17862}
17863
17864TEST_SVE(sve_fp_compare_vectors) {
17865 double inf_p = kFP64PositiveInfinity;
17866 double inf_n = kFP64NegativeInfinity;
17867 double nan = kFP64DefaultNaN;
17868
17869 // Normal floating point comparison has been tested in the helper.
17870 double zn[] = {0.0, inf_n, 1.0, inf_p, inf_p, nan, 0.0, nan};
17871 double zm[] = {-0.0, inf_n, inf_n, -2.0, inf_n, nan, nan, inf_p};
17872
17873 int pd_fcm_gt[] = {0, 0, 1, 1, 1, 0, 0, 0};
17874 int pd_fcm_lt[] = {0, 0, 0, 0, 0, 0, 0, 0};
17875 int pd_fcm_ge[] = {1, 1, 1, 1, 1, 0, 0, 0};
17876 int pd_fcm_le[] = {1, 1, 0, 0, 0, 0, 0, 0};
17877 int pd_fcm_eq[] = {1, 1, 0, 0, 0, 0, 0, 0};
Jacob Bramley4606adc2020-07-02 14:23:08 +010017878 int pd_fcm_ne[] = {0, 0, 1, 1, 1, 1, 1, 1};
TatWai Chong47c26842020-02-10 01:51:32 -080017879 int pd_fcm_uo[] = {0, 0, 0, 0, 0, 1, 1, 1};
17880 int pd_fac_gt[] = {0, 0, 0, 1, 0, 0, 0, 0};
17881 int pd_fac_lt[] = {0, 0, 1, 0, 0, 0, 0, 0};
17882 int pd_fac_ge[] = {1, 1, 0, 1, 1, 0, 0, 0};
17883 int pd_fac_le[] = {1, 1, 1, 0, 1, 0, 0, 0};
17884
17885 int lane_sizes[] = {kHRegSize, kSRegSize, kDRegSize};
17886
17887 for (size_t i = 0; i < ArrayLength(lane_sizes); i++) {
17888 int lane_size = lane_sizes[i];
17889 // Test floating-point compare vectors.
17890 TestFpCompareHelper(config, lane_size, gt, zn, zm, pd_fcm_gt);
17891 TestFpCompareHelper(config, lane_size, lt, zn, zm, pd_fcm_lt);
17892 TestFpCompareHelper(config, lane_size, ge, zn, zm, pd_fcm_ge);
17893 TestFpCompareHelper(config, lane_size, le, zn, zm, pd_fcm_le);
17894 TestFpCompareHelper(config, lane_size, eq, zn, zm, pd_fcm_eq);
17895 TestFpCompareHelper(config, lane_size, ne, zn, zm, pd_fcm_ne);
17896 TestFpCompareHelper(config, lane_size, uo, zn, zm, pd_fcm_uo);
17897
17898 // Test floating-point absolute compare vectors.
17899 TestFpCompareHelper(config, lane_size, gt, zn, zm, pd_fac_gt, true);
17900 TestFpCompareHelper(config, lane_size, lt, zn, zm, pd_fac_lt, true);
17901 TestFpCompareHelper(config, lane_size, ge, zn, zm, pd_fac_ge, true);
17902 TestFpCompareHelper(config, lane_size, le, zn, zm, pd_fac_le, true);
17903 }
17904}
17905
TatWai Chonge3775132020-02-16 22:13:17 -080017906template <size_t N, typename T>
17907static void TestFpCompareZeroHelper(Test* config,
17908 int lane_size_in_bits,
17909 Condition cond,
17910 const T (&zn_inputs)[N],
17911 const int (&pd_expected)[N]) {
17912 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
17913 START();
17914
17915 ZRegister zn = z28.WithLaneSize(lane_size_in_bits);
17916 PRegisterWithLaneSize pd = p14.WithLaneSize(lane_size_in_bits);
17917
17918 uint64_t zn_rawbits[N];
17919 FPToRawbitsWithSize(zn_inputs, zn_rawbits, lane_size_in_bits);
17920 InsrHelper(&masm, zn, zn_rawbits);
17921
17922 __ Ptrue(p0.VnB());
Jacob Bramley5a5e71f2020-07-02 13:54:58 +010017923 (masm.*GetFpCompareZeroFn(cond))(pd, p0.Zeroing(), zn, 0.0);
TatWai Chonge3775132020-02-16 22:13:17 -080017924
17925 END();
17926
17927 if (CAN_RUN()) {
17928 RUN();
17929
17930 ASSERT_EQUAL_SVE(pd_expected, pd);
17931 }
17932}
17933
17934TEST_SVE(sve_fp_compare_vector_zero) {
17935 Float16 fp16_inf_p = kFP16PositiveInfinity;
17936 Float16 fp16_inf_n = kFP16NegativeInfinity;
17937 Float16 fp16_dn = kFP16DefaultNaN;
17938 Float16 fp16_sn = RawbitsToFloat16(0x7c22);
17939 Float16 fp16_qn = RawbitsToFloat16(0x7e55);
17940
17941 float fp32_inf_p = kFP32PositiveInfinity;
17942 float fp32_inf_n = kFP32NegativeInfinity;
17943 float fp32_dn = kFP32DefaultNaN;
17944 float fp32_sn = RawbitsToFloat(0x7f952222);
17945 float fp32_qn = RawbitsToFloat(0x7fea2222);
17946
17947 double fp64_inf_p = kFP64PositiveInfinity;
17948 double fp64_inf_n = kFP64NegativeInfinity;
17949 double fp64_dn = kFP64DefaultNaN;
17950 double fp64_sn = RawbitsToDouble(0x7ff5555511111111);
17951 double fp64_qn = RawbitsToDouble(0x7ffaaaaa11111111);
17952
17953 // Normal floating point comparison has been tested in the non-zero form.
17954 Float16 zn_inputs_h[] = {Float16(0.0),
17955 Float16(-0.0),
17956 fp16_inf_p,
17957 fp16_inf_n,
17958 fp16_dn,
17959 fp16_sn,
17960 fp16_qn};
17961 float zn_inputs_s[] =
17962 {0.0, -0.0, fp32_inf_p, fp32_inf_n, fp32_dn, fp32_sn, fp32_qn};
17963 double zn_inputs_d[] =
17964 {0.0, -0.0, fp64_inf_p, fp64_inf_n, fp64_dn, fp64_sn, fp64_qn};
17965
17966 int pd_expected_gt[] = {0, 0, 1, 0, 0, 0, 0};
17967 int pd_expected_lt[] = {0, 0, 0, 1, 0, 0, 0};
17968 int pd_expected_ge[] = {1, 1, 1, 0, 0, 0, 0};
17969 int pd_expected_le[] = {1, 1, 0, 1, 0, 0, 0};
17970 int pd_expected_eq[] = {1, 1, 0, 0, 0, 0, 0};
Jacob Bramley4606adc2020-07-02 14:23:08 +010017971 int pd_expected_ne[] = {0, 0, 1, 1, 1, 1, 1};
TatWai Chonge3775132020-02-16 22:13:17 -080017972
17973 TestFpCompareZeroHelper(config, kDRegSize, gt, zn_inputs_d, pd_expected_gt);
17974 TestFpCompareZeroHelper(config, kDRegSize, lt, zn_inputs_d, pd_expected_lt);
17975 TestFpCompareZeroHelper(config, kDRegSize, ge, zn_inputs_d, pd_expected_ge);
17976 TestFpCompareZeroHelper(config, kDRegSize, le, zn_inputs_d, pd_expected_le);
17977 TestFpCompareZeroHelper(config, kDRegSize, eq, zn_inputs_d, pd_expected_eq);
17978 TestFpCompareZeroHelper(config, kDRegSize, ne, zn_inputs_d, pd_expected_ne);
17979
17980 TestFpCompareZeroHelper(config, kSRegSize, gt, zn_inputs_s, pd_expected_gt);
17981 TestFpCompareZeroHelper(config, kSRegSize, lt, zn_inputs_s, pd_expected_lt);
17982 TestFpCompareZeroHelper(config, kSRegSize, ge, zn_inputs_s, pd_expected_ge);
17983 TestFpCompareZeroHelper(config, kSRegSize, le, zn_inputs_s, pd_expected_le);
17984 TestFpCompareZeroHelper(config, kSRegSize, eq, zn_inputs_s, pd_expected_eq);
17985 TestFpCompareZeroHelper(config, kSRegSize, ne, zn_inputs_s, pd_expected_ne);
17986
17987 TestFpCompareZeroHelper(config, kHRegSize, gt, zn_inputs_h, pd_expected_gt);
17988 TestFpCompareZeroHelper(config, kHRegSize, lt, zn_inputs_h, pd_expected_lt);
17989 TestFpCompareZeroHelper(config, kHRegSize, ge, zn_inputs_h, pd_expected_ge);
17990 TestFpCompareZeroHelper(config, kHRegSize, le, zn_inputs_h, pd_expected_le);
17991 TestFpCompareZeroHelper(config, kHRegSize, eq, zn_inputs_h, pd_expected_eq);
17992 TestFpCompareZeroHelper(config, kHRegSize, ne, zn_inputs_h, pd_expected_ne);
17993}
17994
TatWai Chong2cb1b612020-03-04 23:51:21 -080017995typedef void (MacroAssembler::*FPUnaryMFn)(const ZRegister& zd,
17996 const PRegisterM& pg,
17997 const ZRegister& zn);
17998
17999typedef void (MacroAssembler::*FPUnaryZFn)(const ZRegister& zd,
18000 const PRegisterZ& pg,
18001 const ZRegister& zn);
18002
18003template <size_t N, size_t M>
18004static void TestFPUnaryPredicatedHelper(Test* config,
18005 int src_size_in_bits,
18006 int dst_size_in_bits,
18007 uint64_t (&zn_inputs)[N],
18008 const uint64_t (&pg_inputs)[M],
18009 const uint64_t (&zd_expected)[N],
18010 FPUnaryMFn macro_m,
18011 FPUnaryZFn macro_z) {
18012 // Provide the full predicate input.
18013 VIXL_ASSERT(M == (kPRegMaxSize / kDRegSize));
18014 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
18015 START();
18016
18017 int ds = dst_size_in_bits;
18018 int ss = src_size_in_bits;
18019 int ls = std::max(ss, ds);
18020
18021  // When the destination type is larger than the source type, fill the high
18022  // parts with noise values, which should be ignored.
18023 if (ds > ss) {
18024 VIXL_ASSERT(ss < 64);
18025 uint64_t zn_inputs_mod[N];
18026 uint64_t sn = GetSignallingNan(ss);
18027 for (unsigned i = 0; i < N; i++) {
18028 zn_inputs_mod[i] = zn_inputs[i] | ((sn + i) << ss);
18029 }
18030 InsrHelper(&masm, z29.WithLaneSize(ls), zn_inputs_mod);
18031 } else {
18032 InsrHelper(&masm, z29.WithLaneSize(ls), zn_inputs);
18033 }
18034
18035 // Make a copy so we can check that constructive operations preserve zn.
18036 __ Mov(z28, z29);
18037
18038 // Run the operation on all lanes.
18039 __ Ptrue(p0.WithLaneSize(ls));
18040 (masm.*macro_m)(z27.WithLaneSize(ds), p0.Merging(), z28.WithLaneSize(ss));
18041
18042 Initialise(&masm,
18043 p1.VnB(),
18044 pg_inputs[3],
18045 pg_inputs[2],
18046 pg_inputs[1],
18047 pg_inputs[0]);
18048
18049 // Clear the irrelevant lanes.
18050 __ Index(z31.WithLaneSize(ls), 0, 1);
18051 __ Cmplt(p1.WithLaneSize(ls), p1.Zeroing(), z31.WithLaneSize(ls), N);
18052
18053 // Check merging predication.
18054 __ Index(z11.WithLaneSize(ls), 42, 1);
18055 // Preserve the base value so we can derive the expected result.
18056 __ Mov(z21, z11);
18057 __ Mov(z9, z11);
18058 (masm.*macro_m)(z11.WithLaneSize(ds), p1.Merging(), z28.WithLaneSize(ss));
18059
18060 // Generate expected values using explicit merging operations.
18061 InsrHelper(&masm, z25.WithLaneSize(ls), zd_expected);
18062 __ Mov(z21.WithLaneSize(ls), p1.Merging(), z25.WithLaneSize(ls));
18063
18064 // Check zeroing predication.
18065 __ Index(z12.WithLaneSize(ds), 42, -1);
18066 (masm.*macro_z)(z12.WithLaneSize(ds), p1.Zeroing(), z28.WithLaneSize(ss));
18067
18068 // Generate expected values using explicit zeroing operations.
18069 InsrHelper(&masm, z30.WithLaneSize(ls), zd_expected);
18070 // Emulate zeroing predication.
18071 __ Dup(z22.WithLaneSize(ls), 0);
18072 __ Mov(z22.WithLaneSize(ls), p1.Merging(), z30.WithLaneSize(ls));
18073
18074 // Check an in-place update.
18075 __ Mov(z9.WithLaneSize(ls), p1.Merging(), z28.WithLaneSize(ls));
18076 (masm.*macro_m)(z9.WithLaneSize(ds), p1.Merging(), z9.WithLaneSize(ss));
18077
18078 END();
18079
18080 if (CAN_RUN()) {
18081 RUN();
18082
18083 // Check all lanes.
18084 ASSERT_EQUAL_SVE(zd_expected, z27.WithLaneSize(ls));
18085
18086 // Check that constructive operations preserve their inputs.
18087 ASSERT_EQUAL_SVE(z28, z29);
18088
18089 // Check merging predication.
18090    ASSERT_EQUAL_SVE(z21.WithLaneSize(ls), z11.WithLaneSize(ls));
18091
18092 // Check zeroing predication.
18093 ASSERT_EQUAL_SVE(z22.WithLaneSize(ls), z12.WithLaneSize(ls));
18094
18095 // Check in-place operation where zd == zn.
18096 ASSERT_EQUAL_SVE(z21.WithLaneSize(ls), z9.WithLaneSize(ls));
18097 }
18098}
18099
18100template <size_t N, typename T>
18101static void TestFPUnaryPredicatedHelper(Test* config,
18102 int src_size_in_bits,
18103 int dst_size_in_bits,
18104 T (&zn_inputs)[N],
18105 const T (&zd_expected)[N],
18106 FPUnaryMFn macro_m,
18107 FPUnaryZFn macro_z) {
18108 uint64_t pg_inputs[] = {0xa55aa55aa55aa55a,
18109 0xa55aa55aa55aa55a,
18110 0xa55aa55aa55aa55a,
18111 0xa55aa55aa55aa55a};
18112
18113 TestFPUnaryPredicatedHelper(config,
18114 src_size_in_bits,
18115 dst_size_in_bits,
18116 zn_inputs,
18117 pg_inputs,
18118 zd_expected,
18119 macro_m,
18120 macro_z);
18121
Josh Sorefb43d6ef2022-08-03 12:47:14 -040018122  // Use the complement of the above predicate to get full input coverage.
TatWai Chong2cb1b612020-03-04 23:51:21 -080018123 uint64_t pg_c_inputs[] = {0x5aa55aa55aa55aa5,
18124 0x5aa55aa55aa55aa5,
18125 0x5aa55aa55aa55aa5,
18126 0x5aa55aa55aa55aa5};
18127
18128 TestFPUnaryPredicatedHelper(config,
18129 src_size_in_bits,
18130 dst_size_in_bits,
18131 zn_inputs,
18132 pg_c_inputs,
18133 zd_expected,
18134 macro_m,
18135 macro_z);
18136}
18137
18138template <size_t N, typename T>
18139static void TestFcvtHelper(Test* config,
18140 int src_size_in_bits,
18141 int dst_size_in_bits,
18142 T (&zn_inputs)[N],
18143 const T (&zd_expected)[N]) {
18144 TestFPUnaryPredicatedHelper(config,
18145 src_size_in_bits,
18146 dst_size_in_bits,
18147 zn_inputs,
18148 zd_expected,
18149 &MacroAssembler::Fcvt, // Merging form.
18150                              &MacroAssembler::Fcvt);  // Zeroing form.
18151}
18152
18153TEST_SVE(sve_fcvt) {
18154 uint64_t h_vals[] = {0x7c00,
18155 0xfc00,
18156 0,
18157 0x8000,
18158 0x7bff, // Max half precision.
18159 0x0400, // Min positive normal.
18160 0x03ff, // Max subnormal.
18161 0x0001}; // Min positive subnormal.
18162
18163 uint64_t s_vals[] = {0x7f800000,
18164 0xff800000,
18165 0,
18166 0x80000000,
18167 0x477fe000,
18168 0x38800000,
18169 0x387fc000,
18170 0x33800000};
18171
18172 uint64_t d_vals[] = {0x7ff0000000000000,
18173 0xfff0000000000000,
18174 0,
18175 0x8000000000000000,
18176 0x40effc0000000000,
18177 0x3f10000000000000,
18178 0x3f0ff80000000000,
18179 0x3e70000000000000};
18180
18181 TestFcvtHelper(config, kHRegSize, kSRegSize, h_vals, s_vals);
18182 TestFcvtHelper(config, kSRegSize, kHRegSize, s_vals, h_vals);
18183 TestFcvtHelper(config, kSRegSize, kDRegSize, s_vals, d_vals);
18184 TestFcvtHelper(config, kDRegSize, kSRegSize, d_vals, s_vals);
18185 TestFcvtHelper(config, kHRegSize, kDRegSize, h_vals, d_vals);
18186 TestFcvtHelper(config, kDRegSize, kHRegSize, d_vals, h_vals);
18187}
18188
18189TEST_SVE(sve_fcvt_nan) {
18190 uint64_t h_inputs[] = {0x7e55, // Quiet NaN.
18191 0x7c22}; // Signalling NaN.
18192
18193 uint64_t h2s_expected[] = {0x7fcaa000, 0x7fc44000};
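  // For example, widening 0x7e55 to single precision gives 0x7fcaa000: the ten
  // fraction bits move to the top of the 23-bit fraction, and the signalling
  // input 0x7c22 additionally has its quiet bit set, giving 0x7fc44000.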
18194
18195 uint64_t h2d_expected[] = {0x7ff9540000000000, 0x7ff8880000000000};
18196
18197 uint64_t s_inputs[] = {0x7fc12345, // Quiet NaN.
18198 0x7f812345}; // Signalling NaN.
18199
18200 uint64_t s2h_expected[] = {0x7e09, 0x7e09};
18201
18202 uint64_t s2d_expected[] = {0x7ff82468a0000000, 0x7ff82468a0000000};
18203
18204 uint64_t d_inputs[] = {0x7ffaaaaa22222222, // Quiet NaN.
18205 0x7ff5555511111111}; // Signalling NaN.
18206
18207 uint64_t d2h_expected[] = {0x7eaa, 0x7f55};
18208
18209 uint64_t d2s_expected[] = {0x7fd55551, 0x7feaaaa8};
18210
18211 TestFcvtHelper(config, kHRegSize, kSRegSize, h_inputs, h2s_expected);
18212 TestFcvtHelper(config, kSRegSize, kHRegSize, s_inputs, s2h_expected);
18213 TestFcvtHelper(config, kHRegSize, kDRegSize, h_inputs, h2d_expected);
18214 TestFcvtHelper(config, kDRegSize, kHRegSize, d_inputs, d2h_expected);
18215 TestFcvtHelper(config, kSRegSize, kDRegSize, s_inputs, s2d_expected);
18216 TestFcvtHelper(config, kDRegSize, kSRegSize, d_inputs, d2s_expected);
18217}
18218
TatWai Chongf60f6dc2020-02-21 10:48:11 -080018219template <size_t N, typename T>
18220static void TestFrecpxHelper(Test* config,
18221 int lane_size_in_bits,
18222 T (&zn_inputs)[N],
18223 const T (&zd_expected)[N]) {
18224 TestFPUnaryPredicatedHelper(config,
18225 lane_size_in_bits,
18226 lane_size_in_bits,
18227 zn_inputs,
18228 zd_expected,
18229 &MacroAssembler::Frecpx, // Merging form.
18230                              &MacroAssembler::Frecpx);  // Zeroing form.
18231}
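// For reference: Frecpx preserves the sign, clears the fraction, and replaces
// the exponent field with its bitwise NOT (an input exponent of zero maps to
// the maximum finite exponent). For example, in the half-precision test below,
// 0x7bff maps to 0x0400 and 0x0001 maps to 0x7800.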
18232
18233TEST_SVE(sve_frecpx_h) {
18234 uint64_t zn_inputs[] = {Float16ToRawbits(kFP16PositiveInfinity),
18235 Float16ToRawbits(kFP16NegativeInfinity),
18236 Float16ToRawbits(Float16(0.0)),
18237 Float16ToRawbits(Float16(-0.0)),
18238 0x0001, // Smallest positive subnormal number.
18239 0x03ff, // Largest subnormal number.
18240 0x0400, // Smallest positive normal number.
18241 0x7bff, // Largest normal number.
18242 0x3bff, // Largest number less than one.
18243 0x3c01, // Smallest number larger than one.
18244 0x7c22, // Signalling NaN.
18245 0x7e55}; // Quiet NaN.
18246
18247 uint64_t zd_expected[] = {0,
18248 0x8000,
18249 0x7800,
18250 0xf800,
18251                             // The exponent of subnormal numbers is zero.
18252 0x7800,
18253 0x7800,
18254 0x7800,
18255 0x0400,
18256 0x4400,
18257 0x4000,
18258 0x7e22, // To quiet NaN.
18259 0x7e55};
18260
18261 TestFrecpxHelper(config, kHRegSize, zn_inputs, zd_expected);
18262}
18263
18264TEST_SVE(sve_frecpx_s) {
18265 uint64_t zn_inputs[] = {FloatToRawbits(kFP32PositiveInfinity),
18266 FloatToRawbits(kFP32NegativeInfinity),
18267 FloatToRawbits(65504), // Max half precision.
18268 FloatToRawbits(6.10352e-5), // Min positive normal.
18269 FloatToRawbits(6.09756e-5), // Max subnormal.
18270 FloatToRawbits(
18271 5.96046e-8), // Min positive subnormal.
18272 FloatToRawbits(5e-9), // Not representable -> zero.
18273 FloatToRawbits(-0.0),
18274 FloatToRawbits(0.0),
18275 0x7f952222, // Signalling NaN.
18276                          0x7fea2222};  // Quiet NaN.
18277
18278 uint64_t zd_expected[] = {0, // 0.0
18279 0x80000000, // -0.0
18280 0x38800000, // 6.10352e-05
18281 0x47000000, // 32768
18282 0x47800000, // 65536
18283 0x4c800000, // 6.71089e+07
18284 0x4e000000, // 5.36871e+08
18285 0xff000000, // -1.70141e+38
18286 0x7f000000, // 1.70141e+38
18287 0x7fd52222,
18288 0x7fea2222};
18289
18290 TestFrecpxHelper(config, kSRegSize, zn_inputs, zd_expected);
18291}
18292
18293TEST_SVE(sve_frecpx_d) {
18294 uint64_t zn_inputs[] = {DoubleToRawbits(kFP64PositiveInfinity),
18295 DoubleToRawbits(kFP64NegativeInfinity),
18296 DoubleToRawbits(65504), // Max half precision.
18297 DoubleToRawbits(6.10352e-5), // Min positive normal.
18298 DoubleToRawbits(6.09756e-5), // Max subnormal.
18299 DoubleToRawbits(
18300 5.96046e-8), // Min positive subnormal.
18301 DoubleToRawbits(5e-9), // Not representable -> zero.
18302 DoubleToRawbits(-0.0),
18303 DoubleToRawbits(0.0),
18304 0x7ff5555511111111, // Signalling NaN.
18305                          0x7ffaaaaa11111111};  // Quiet NaN.
18306
18307 uint64_t zd_expected[] = {0, // 0.0
18308 0x8000000000000000, // -0.0
18309 0x3f10000000000000, // 6.10352e-05
18310 0x40e0000000000000, // 32768
18311 0x40f0000000000000, // 65536
18312 0x4190000000000000, // 6.71089e+07
18313 0x41c0000000000000, // 5.36871e+08
18314 0xffe0000000000000, // -1.70141e+38
18315 0x7fe0000000000000, // 1.70141e+38
18316 0x7ffd555511111111,
18317 0x7ffaaaaa11111111};
18318
18319 TestFrecpxHelper(config, kDRegSize, zn_inputs, zd_expected);
18320}
TatWai Chong2cb1b612020-03-04 23:51:21 -080018321
TatWai Chongb4a25f62020-02-27 00:53:57 -080018322template <size_t N, typename T>
18323static void TestFsqrtHelper(Test* config,
18324 int lane_size_in_bits,
18325 T (&zn_inputs)[N],
18326 const T (&zd_expected)[N]) {
18327 TestFPUnaryPredicatedHelper(config,
18328 lane_size_in_bits,
18329 lane_size_in_bits,
18330 zn_inputs,
18331 zd_expected,
18332 &MacroAssembler::Fsqrt, // Merging form.
18333                              &MacroAssembler::Fsqrt);  // Zeroing form.
18334}
18335
18336TEST_SVE(sve_fsqrt_h) {
18337 uint64_t zn_inputs[] =
18338 {Float16ToRawbits(Float16(0.0)),
18339 Float16ToRawbits(Float16(-0.0)),
18340 Float16ToRawbits(Float16(1.0)),
18341 Float16ToRawbits(Float16(65025.0)),
18342 Float16ToRawbits(kFP16PositiveInfinity),
18343 Float16ToRawbits(kFP16NegativeInfinity),
18344 Float16ToRawbits(Float16(6.10352e-5)), // Min normal positive.
18345 Float16ToRawbits(Float16(65504.0)), // Max normal positive float.
18346 Float16ToRawbits(Float16(6.09756e-5)), // Max subnormal.
18347 Float16ToRawbits(Float16(5.96046e-8)), // Min subnormal positive.
18348       0x7c22,   // Signalling NaN.
18349       0x7e55};  // Quiet NaN.
18350
18351 uint64_t zd_expected[] = {Float16ToRawbits(Float16(0.0)),
18352 Float16ToRawbits(Float16(-0.0)),
18353 Float16ToRawbits(Float16(1.0)),
18354 Float16ToRawbits(Float16(255.0)),
18355 Float16ToRawbits(kFP16PositiveInfinity),
18356 Float16ToRawbits(kFP16DefaultNaN),
18357 0x2000,
18358 0x5bff,
18359 0x1fff,
18360 0x0c00,
18361 0x7e22, // To quiet NaN.
18362 0x7e55};
18363
18364 TestFsqrtHelper(config, kHRegSize, zn_inputs, zd_expected);
18365}
18366
18367TEST_SVE(sve_fsqrt_s) {
18368 uint64_t zn_inputs[] = {FloatToRawbits(0.0f),
18369 FloatToRawbits(-0.0f),
18370 FloatToRawbits(1.0f),
18371 FloatToRawbits(65536.0f),
18372 FloatToRawbits(kFP32PositiveInfinity),
18373 FloatToRawbits(kFP32NegativeInfinity),
18374 0x00800000, // Min normal positive, ~1.17e−38
18375 0x7f7fffff, // Max normal positive, ~3.40e+38
18376 0x00000001, // Min subnormal positive, ~1.40e−45
18377 0x007fffff, // Max subnormal, ~1.17e−38
18378                          0x7f951111,   // Signalling NaN.
18379                          0x7fea1111};  // Quiet NaN.
18380
18381 uint64_t zd_expected[] = {FloatToRawbits(0.0f),
18382 FloatToRawbits(-0.0f),
18383 FloatToRawbits(1.0f),
18384 FloatToRawbits(256.0f),
18385 FloatToRawbits(kFP32PositiveInfinity),
18386 FloatToRawbits(kFP32DefaultNaN),
18387 0x20000000, // ~1.08e-19
18388 0x5f7fffff, // ~1.84e+19
18389 0x1a3504f3, // ~3.74e-23
18390 0x1fffffff, // ~1.08e-19
18391 0x7fd51111, // To quiet NaN.
18392 0x7fea1111};
18393
18394 TestFsqrtHelper(config, kSRegSize, zn_inputs, zd_expected);
18395}
18396
18397TEST_SVE(sve_fsqrt_d) {
18398 uint64_t zn_inputs[] =
18399 {DoubleToRawbits(0.0),
18400 DoubleToRawbits(-0.0),
18401 DoubleToRawbits(1.0),
18402 DoubleToRawbits(65536.0),
18403 DoubleToRawbits(kFP64PositiveInfinity),
18404 DoubleToRawbits(kFP64NegativeInfinity),
18405 0x0010000000000000, // Min normal positive, ~2.22e-308
18406 0x7fefffffffffffff, // Max normal positive, ~1.79e+308
18407 0x0000000000000001, // Min subnormal positive, 5e-324
18408 0x000fffffffffffff, // Max subnormal, ~2.22e-308
18409 0x7ff5555511111111,
18410 0x7ffaaaaa11111111};
18411
18412 uint64_t zd_expected[] = {DoubleToRawbits(0.0),
18413 DoubleToRawbits(-0.0),
18414 DoubleToRawbits(1.0),
18415 DoubleToRawbits(256.0),
18416 DoubleToRawbits(kFP64PositiveInfinity),
18417 DoubleToRawbits(kFP64DefaultNaN),
18418 0x2000000000000000, // ~1.49e-154
18419 0x5fefffffffffffff, // ~1.34e+154
18420 0x1e60000000000000, // ~2.22e-162
18421 0x1fffffffffffffff, // ~1.49e-154
18422 0x7ffd555511111111, // To quiet NaN.
18423 0x7ffaaaaa11111111};
18424
18425 TestFsqrtHelper(config, kDRegSize, zn_inputs, zd_expected);
18426}
18427
Martyn Capewell48522f52020-03-16 15:31:19 +000018428TEST_SVE(sve_adr) {
18429 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
18430 START();
18431
18432 __ Index(z0.VnD(), 0x10000000f0000000, 0x1000);
18433 __ Index(z1.VnD(), 1, 3);
18434 __ Index(z2.VnS(), -1, -1);
18435 __ Adr(z3.VnD(), SVEMemOperand(z0.VnD(), z1.VnD()));
18436 __ Adr(z4.VnD(), SVEMemOperand(z0.VnD(), z1.VnD(), LSL, 1));
18437 __ Adr(z5.VnD(), SVEMemOperand(z0.VnD(), z1.VnD(), LSL, 2));
18438 __ Adr(z6.VnD(), SVEMemOperand(z0.VnD(), z1.VnD(), LSL, 3));
18439 __ Adr(z7.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), UXTW));
18440 __ Adr(z8.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), UXTW, 1));
18441 __ Adr(z9.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), UXTW, 2));
18442 __ Adr(z10.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), UXTW, 3));
18443 __ Adr(z11.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), SXTW));
18444 __ Adr(z12.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), SXTW, 1));
18445 __ Adr(z13.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), SXTW, 2));
18446 __ Adr(z14.VnD(), SVEMemOperand(z0.VnD(), z2.VnD(), SXTW, 3));
18447 __ Adr(z15.VnS(), SVEMemOperand(z0.VnS(), z2.VnS()));
18448 __ Adr(z16.VnS(), SVEMemOperand(z0.VnS(), z2.VnS(), LSL, 1));
18449 __ Adr(z17.VnS(), SVEMemOperand(z0.VnS(), z2.VnS(), LSL, 2));
18450 __ Adr(z18.VnS(), SVEMemOperand(z0.VnS(), z2.VnS(), LSL, 3));
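  // Adr computes zd = zn + (extend(zm) << shift) for each lane, where the
  // optional UXTW/SXTW extension applies to the low 32 bits of zm. For
  // example, the lowest lane of z3 is 0x10000000f0000000 + 1.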
18451
18452 END();
18453
18454 if (CAN_RUN()) {
18455 RUN();
18456 uint64_t expected_z3[] = {0x10000000f0001004, 0x10000000f0000001};
18457 uint64_t expected_z4[] = {0x10000000f0001008, 0x10000000f0000002};
18458 uint64_t expected_z5[] = {0x10000000f0001010, 0x10000000f0000004};
18459 uint64_t expected_z6[] = {0x10000000f0001020, 0x10000000f0000008};
18460 uint64_t expected_z7[] = {0x10000001f0000ffd, 0x10000001efffffff};
18461 uint64_t expected_z8[] = {0x10000002f0000ffa, 0x10000002effffffe};
18462 uint64_t expected_z9[] = {0x10000004f0000ff4, 0x10000004effffffc};
18463 uint64_t expected_z10[] = {0x10000008f0000fe8, 0x10000008effffff8};
18464 uint64_t expected_z11[] = {0x10000000f0000ffd, 0x10000000efffffff};
18465 uint64_t expected_z12[] = {0x10000000f0000ffa, 0x10000000effffffe};
18466 uint64_t expected_z13[] = {0x10000000f0000ff4, 0x10000000effffffc};
18467 uint64_t expected_z14[] = {0x10000000f0000fe8, 0x10000000effffff8};
18468 uint64_t expected_z15[] = {0x0ffffffcf0000ffd, 0x0ffffffeefffffff};
18469 uint64_t expected_z16[] = {0x0ffffff8f0000ffa, 0x0ffffffceffffffe};
18470 uint64_t expected_z17[] = {0x0ffffff0f0000ff4, 0x0ffffff8effffffc};
18471 uint64_t expected_z18[] = {0x0fffffe0f0000fe8, 0x0ffffff0effffff8};
18472
18473 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
18474 ASSERT_EQUAL_SVE(expected_z4, z4.VnD());
18475 ASSERT_EQUAL_SVE(expected_z5, z5.VnD());
18476 ASSERT_EQUAL_SVE(expected_z6, z6.VnD());
18477 ASSERT_EQUAL_SVE(expected_z7, z7.VnD());
18478 ASSERT_EQUAL_SVE(expected_z8, z8.VnD());
18479 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
18480 ASSERT_EQUAL_SVE(expected_z10, z10.VnD());
18481 ASSERT_EQUAL_SVE(expected_z11, z11.VnD());
18482 ASSERT_EQUAL_SVE(expected_z12, z12.VnD());
18483 ASSERT_EQUAL_SVE(expected_z13, z13.VnD());
18484 ASSERT_EQUAL_SVE(expected_z14, z14.VnD());
18485 ASSERT_EQUAL_SVE(expected_z15, z15.VnD());
18486 ASSERT_EQUAL_SVE(expected_z16, z16.VnD());
18487 ASSERT_EQUAL_SVE(expected_z17, z17.VnD());
18488 ASSERT_EQUAL_SVE(expected_z18, z18.VnD());
18489 }
18490}
18491
TatWai Chong85e15102020-05-04 21:00:40 -070018492// Test load-and-broadcast instructions by comparing their results with those
18493// of equivalent scalar loads.
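// Each Ld1r<t> instruction loads a single <t>-sized element and broadcasts it
// to every active lane of the destination, so the reference result can be
// built from ordinary scalar loads of the same address.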
18494template <typename F>
18495static void LoadBcastHelper(Test* config,
18496 unsigned msize_in_bits,
18497 unsigned esize_in_bits,
18498 F sve_ld1,
18499 bool is_signed) {
18500 VIXL_ASSERT((esize_in_bits == kBRegSize) || (esize_in_bits == kHRegSize) ||
18501 (esize_in_bits == kSRegSize) || (esize_in_bits == kDRegSize));
18502 static const unsigned kMaxLaneCount = kZRegMaxSize / kBRegSize;
18503
18504 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
18505 START();
18506
18507 unsigned msize_in_bytes = msize_in_bits / kBitsPerByte;
18508 unsigned esize_in_bytes = esize_in_bits / kBitsPerByte;
18509 int vl = config->sve_vl_in_bytes();
18510
18511 uint64_t offsets[kMaxLaneCount];
18512 uint64_t buffer_size = vl * 64;
18513 uint64_t data = reinterpret_cast<uintptr_t>(malloc(buffer_size));
18514 BufferFillingHelper(data,
18515 buffer_size,
18516 msize_in_bytes,
18517 kMaxLaneCount,
18518 offsets);
18519
18520 for (unsigned i = 0; i < (kMaxLaneCount / 2); i++) {
18521    // Assign encodable offsets to the first part of the offset array so
18522    // that both encodable and unencodable offsets can be tested.
18523    // Note that the immediate offset field is only six bits wide.
18524 offsets[i] = (offsets[i] % (UINT64_C(1) << 6)) * msize_in_bytes;
18525 }
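  // For example, with msize_in_bytes == 8 (as for Ld1rd), the directly
  // encodable immediate offsets are 0, 8, 16, ..., 504 (63 * 8).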
18526
18527 ZRegister zn = z0.WithLaneSize(esize_in_bits);
18528 ZRegister zn_ref = z4.WithLaneSize(esize_in_bits);
18529
18530 PRegisterZ pg = p0.Zeroing();
18531 Initialise(&masm,
18532 pg,
18533 0x9abcdef012345678,
18534 0xabcdef0123456789,
18535 0xf4f3f1f0fefdfcfa,
18536 0xf9f8f6f5f3f2f0ff);
18537
18538 __ Mov(x2, data);
18539  uint64_t encodable_offset = offsets[0];
18540  // A simple check that the operation is correct for a single offset.
18541  (masm.*sve_ld1)(zn, pg, SVEMemOperand(x2, encodable_offset));
18542
18543  // Generate a reference result using scalar loads.
18544  uint64_t address = data + encodable_offset;
18545 uint64_t duplicated_addresses[kMaxLaneCount];
18546 for (unsigned i = 0; i < kMaxLaneCount; i++) {
18547 duplicated_addresses[i] = address;
18548 }
18549
18550 ScalarLoadHelper(&masm,
18551 vl,
18552 duplicated_addresses,
18553 zn_ref,
18554 pg,
18555 esize_in_bits,
18556 msize_in_bits,
18557 is_signed);
18558
18559 ZRegister zn_agg = z10.WithLaneSize(esize_in_bits);
18560 ZRegister zn_agg_ref = z11.WithLaneSize(esize_in_bits);
18561 ZRegister zn_temp = z12.WithLaneSize(esize_in_bits);
18562
18563 __ Dup(zn_agg, 0);
18564 __ Dup(zn_agg_ref, 0);
18565
18566  // Check that the operation is correct for various offsets.
18567 for (unsigned i = 0; i < (vl / esize_in_bytes); i++) {
18568 (masm.*sve_ld1)(zn_temp, pg, SVEMemOperand(x2, offsets[i]));
18569 __ Lastb(x1, pg, zn_temp);
18570 __ Insr(zn_agg, x1);
18571
18572 __ Mov(x3, data + offsets[i]);
18573 ScalarLoadHelper(&masm, x1, x3, msize_in_bits, is_signed);
18574 __ Insr(zn_agg_ref, x1);
18575 }
18576
18577 END();
18578
18579 if (CAN_RUN()) {
18580 RUN();
18581
18582 ASSERT_EQUAL_SVE(zn_ref, zn);
18583 ASSERT_EQUAL_SVE(zn_agg_ref, zn_agg);
18584 }
18585
18586 free(reinterpret_cast<void*>(data));
18587}
18588
18589TEST_SVE(sve_ld1rb) {
18590 LoadBcastHelper(config, kBRegSize, kBRegSize, &MacroAssembler::Ld1rb, false);
18591 LoadBcastHelper(config, kBRegSize, kHRegSize, &MacroAssembler::Ld1rb, false);
18592 LoadBcastHelper(config, kBRegSize, kSRegSize, &MacroAssembler::Ld1rb, false);
18593 LoadBcastHelper(config, kBRegSize, kDRegSize, &MacroAssembler::Ld1rb, false);
18594}
18595
18596TEST_SVE(sve_ld1rh) {
18597 LoadBcastHelper(config, kHRegSize, kHRegSize, &MacroAssembler::Ld1rh, false);
18598 LoadBcastHelper(config, kHRegSize, kSRegSize, &MacroAssembler::Ld1rh, false);
18599 LoadBcastHelper(config, kHRegSize, kDRegSize, &MacroAssembler::Ld1rh, false);
18600}
18601
18602TEST_SVE(sve_ld1rw) {
18603 LoadBcastHelper(config, kSRegSize, kSRegSize, &MacroAssembler::Ld1rw, false);
18604 LoadBcastHelper(config, kSRegSize, kDRegSize, &MacroAssembler::Ld1rw, false);
18605}
18606
18607TEST_SVE(sve_ld1rd) {
18608 LoadBcastHelper(config, kDRegSize, kDRegSize, &MacroAssembler::Ld1rd, false);
18609}
18610
18611TEST_SVE(sve_ld1rsb) {
18612 LoadBcastHelper(config, kBRegSize, kHRegSize, &MacroAssembler::Ld1rsb, true);
18613 LoadBcastHelper(config, kBRegSize, kSRegSize, &MacroAssembler::Ld1rsb, true);
18614 LoadBcastHelper(config, kBRegSize, kDRegSize, &MacroAssembler::Ld1rsb, true);
18615}
18616
18617TEST_SVE(sve_ld1rsh) {
18618 LoadBcastHelper(config, kHRegSize, kSRegSize, &MacroAssembler::Ld1rsh, true);
18619 LoadBcastHelper(config, kHRegSize, kDRegSize, &MacroAssembler::Ld1rsh, true);
18620}
18621
18622TEST_SVE(sve_ld1rsw) {
18623 LoadBcastHelper(config, kSRegSize, kDRegSize, &MacroAssembler::Ld1rsw, true);
18624}
18625
TatWai Chong3db2c492020-03-29 22:20:41 -070018626TEST_SVE(sve_prefetch_offset) {
18627 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
18628
18629 START();
18630
18631 __ Prfb(PLDL1KEEP, p5, SVEMemOperand(z30.VnS(), 0));
18632 __ Prfb(PLDL1STRM, p5, SVEMemOperand(x28, -11, SVE_MUL_VL));
Martyn Capewellecca4b12020-07-02 14:30:50 +010018633 __ Prfb(PLDL2KEEP, p6, SVEMemOperand(x30, x29));
TatWai Chong3db2c492020-03-29 22:20:41 -070018634 __ Prfb(PLDL2STRM, p6, SVEMemOperand(x7, z12.VnS(), UXTW));
18635 __ Prfh(PSTL2KEEP, p6, SVEMemOperand(z0.VnS(), 28));
18636 __ Prfh(PSTL2STRM, p4, SVEMemOperand(x17, -3, SVE_MUL_VL));
Martyn Capewell102e7a52020-07-02 11:24:11 +010018637 __ Prfh(PSTL3KEEP, p3, SVEMemOperand(x0, x0, LSL, 1));
18638 __ Prfh(PSTL3STRM, p4, SVEMemOperand(x20, z0.VnD(), LSL, 1));
TatWai Chong3db2c492020-03-29 22:20:41 -070018639 __ Prfw(PLDL1KEEP, p3, SVEMemOperand(z23.VnD(), 5));
18640 __ Prfw(PLDL1STRM, p1, SVEMemOperand(x4, 10, SVE_MUL_VL));
Martyn Capewell102e7a52020-07-02 11:24:11 +010018641 __ Prfw(PLDL2KEEP, p2, SVEMemOperand(x22, x22, LSL, 2));
18642 __ Prfw(PLDL2STRM, p1, SVEMemOperand(x2, z6.VnS(), SXTW, 2));
TatWai Chong3db2c492020-03-29 22:20:41 -070018643 __ Prfd(PLDL3KEEP, p5, SVEMemOperand(z11.VnD(), 9));
18644 __ Prfd(PLDL3STRM, p3, SVEMemOperand(x0, -24, SVE_MUL_VL));
Martyn Capewell102e7a52020-07-02 11:24:11 +010018645 __ Prfd(PSTL1KEEP, p7, SVEMemOperand(x5, x5, LSL, 3));
18646 __ Prfd(PSTL1STRM, p1, SVEMemOperand(x19, z18.VnS(), SXTW, 3));
TatWai Chong3db2c492020-03-29 22:20:41 -070018647
18648 END();
18649 if (CAN_RUN()) {
18650 RUN();
18651 }
18652}
18653
Martyn Capewell51643312020-08-24 15:58:57 +010018654TEST_SVE(sve2_match_nmatch) {
18655 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
18656
18657 START();
18658
18659 __ Ptrue(p0.VnB());
18660 __ Ptrue(p1.VnH());
18661 __ Ptrue(p2.VnS());
18662
18663 // Vector to search is bytes 0 - 7, repeating every eight bytes.
18664 __ Index(z0.VnB(), 0, 1);
18665 __ Dup(z0.VnD(), z0.VnD(), 0);
18666
18667 // Elements to find are (repeated) bytes 0 - 3 in the first segment, 4 - 7
18668 // in the second, 8 - 11 in the third, etc.
18669 __ Index(z1.VnB(), 0, 1);
18670 __ Lsr(z1.VnB(), z1.VnB(), 2);
18671
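  // Match sets a predicate lane when the corresponding element of the first
  // source occurs anywhere within the matching 128-bit segment of the second
  // source; Nmatch computes the complement (under the governing predicate).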
18672 __ Match(p3.VnB(), p0.Zeroing(), z0.VnB(), z1.VnB());
18673 __ Match(p4.VnB(), p1.Zeroing(), z0.VnB(), z1.VnB());
18674 __ Nmatch(p0.VnB(), p0.Zeroing(), z0.VnB(), z1.VnB());
18675
18676 __ Uunpklo(z0.VnH(), z0.VnB());
18677 __ Uunpklo(z1.VnH(), z1.VnB());
18678
18679 __ Match(p5.VnH(), p1.Zeroing(), z0.VnH(), z1.VnH());
18680 __ Match(p6.VnH(), p2.Zeroing(), z0.VnH(), z1.VnH());
18681 __ Nmatch(p1.VnH(), p1.Zeroing(), z0.VnH(), z1.VnH());
18682
18683 END();
18684 if (CAN_RUN()) {
18685 RUN();
18686
18687 int p3_exp[] = {1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0,
18688 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1};
18689 ASSERT_EQUAL_SVE(p3_exp, p3.VnB());
18690 int p4_exp[] = {0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
18691 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1};
18692 ASSERT_EQUAL_SVE(p4_exp, p4.VnB());
18693 int p0_exp[] = {0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1,
18694 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0};
18695 ASSERT_EQUAL_SVE(p0_exp, p0.VnB());
18696
18697 int p5_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
18698 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1};
18699 ASSERT_EQUAL_SVE(p5_exp, p5.VnB());
18700 int p6_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
18701 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
18702 ASSERT_EQUAL_SVE(p6_exp, p6.VnB());
18703 int p1_exp[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
18704 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0};
18705 ASSERT_EQUAL_SVE(p1_exp, p1.VnB());
18706 }
18707}
18708
Martyn Capewelleb37ef32020-09-09 16:46:41 +010018709TEST_SVE(sve2_saba_uaba) {
18710 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
18711
18712 START();
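  // Uaba and Saba accumulate the absolute difference of the last two operands
  // onto the first source operand: zd = za + absdiff(zn, zm), interpreting the
  // elements as unsigned or signed values respectively.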
18713
18714 __ Index(z0.VnB(), 0, 1);
18715 __ Dup(z1.VnB(), 0xff);
18716 __ Dup(z2.VnB(), 1);
Martyn Capewell67d2f822020-10-13 16:39:33 +010018717 __ Uaba(z2.VnB(), z2.VnB(), z0.VnB(), z1.VnB());
Martyn Capewelleb37ef32020-09-09 16:46:41 +010018718 __ Index(z0.VnB(), 0, -1);
18719
18720 __ Index(z3.VnH(), 0, 1);
18721 __ Index(z4.VnH(), 1, 1);
Martyn Capewell67d2f822020-10-13 16:39:33 +010018722 __ Uaba(z3.VnH(), z3.VnH(), z3.VnH(), z4.VnH());
Martyn Capewelleb37ef32020-09-09 16:46:41 +010018723
18724 __ Index(z5.VnS(), 3, 6);
18725 __ Index(z6.VnS(), 5, 6);
Martyn Capewell67d2f822020-10-13 16:39:33 +010018726 __ Uaba(z5.VnS(), z5.VnS(), z5.VnS(), z6.VnS());
Martyn Capewelleb37ef32020-09-09 16:46:41 +010018727
18728 __ Index(z7.VnD(), 424, 12);
18729 __ Index(z8.VnD(), 4242, 12);
Martyn Capewell67d2f822020-10-13 16:39:33 +010018730 __ Uaba(z7.VnD(), z7.VnD(), z7.VnD(), z8.VnD());
Martyn Capewelleb37ef32020-09-09 16:46:41 +010018731
18732 __ Index(z9.VnH(), -1, -1);
18733 __ Dup(z10.VnB(), 0);
Martyn Capewell67d2f822020-10-13 16:39:33 +010018734 __ Saba(z10.VnB(), z10.VnB(), z9.VnB(), z10.VnB());
Martyn Capewelleb37ef32020-09-09 16:46:41 +010018735 __ Index(z11.VnH(), 0x0101, 1);
18736
18737 __ Index(z12.VnH(), 0, 1);
18738 __ Index(z13.VnH(), 0, -1);
Martyn Capewell67d2f822020-10-13 16:39:33 +010018739 __ Saba(z13.VnH(), z13.VnH(), z12.VnH(), z13.VnH());
Martyn Capewelleb37ef32020-09-09 16:46:41 +010018740
18741 __ Index(z14.VnS(), 0, 2);
18742 __ Index(z15.VnS(), 0, -2);
Martyn Capewell67d2f822020-10-13 16:39:33 +010018743 __ Saba(z15.VnS(), z15.VnS(), z14.VnS(), z15.VnS());
Martyn Capewelleb37ef32020-09-09 16:46:41 +010018744
18745 __ Index(z16.VnD(), 0, 42);
18746 __ Index(z17.VnD(), 0, -42);
Martyn Capewell67d2f822020-10-13 16:39:33 +010018747 __ Saba(z17.VnD(), z17.VnD(), z16.VnD(), z17.VnD());
TatWai Chong236e7ae2020-09-13 14:55:04 -070018748
Martyn Capewelleb37ef32020-09-09 16:46:41 +010018749 END();
18750
18751 if (CAN_RUN()) {
18752 RUN();
18753
18754 ASSERT_EQUAL_SVE(z0, z2);
18755 ASSERT_EQUAL_SVE(z3, z4);
18756 ASSERT_EQUAL_SVE(z5, z6);
18757 ASSERT_EQUAL_SVE(z7, z8);
18758
18759 ASSERT_EQUAL_SVE(z10, z11);
18760 ASSERT_EQUAL_SVE(z12, z13);
18761 ASSERT_EQUAL_SVE(z14, z15);
18762 ASSERT_EQUAL_SVE(z16, z17);
18763 }
18764}
18765
TatWai Chong236e7ae2020-09-13 14:55:04 -070018766TEST_SVE(sve2_integer_multiply_long_vector) {
18767  // This test only checks Sqdmull[b|t] and Pmull[b|t]; the other instructions
18768  // in the group operate on their elements in the same way.
18769 int32_t zn_inputs_s[] =
18770 {1, -2, 3, -4, 5, -6, 7, -8, INT32_MIN, INT32_MAX, INT32_MAX, INT32_MIN};
18771
18772 int32_t zm_inputs_s[] =
18773 {1, 2, 3, 4, 5, 6, 7, 8, INT32_MAX, INT32_MIN, INT32_MAX, INT32_MIN};
TatWai Chong1719b712020-09-25 18:16:40 -070018774 int64_t sqdmullb_vec_expected_d[] =
TatWai Chong5c080292020-12-13 03:57:15 -080018775 {-8, -32, -72, -128, RawbitsToInt64(0x8000000100000000), INT64_MAX};
TatWai Chong236e7ae2020-09-13 14:55:04 -070018776
TatWai Chong1719b712020-09-25 18:16:40 -070018777 uint64_t sqdmullt_vec_expected_d[] =
18778 {2, 18, 50, 98, 0x8000000100000000, 0x7ffffffe00000002};
TatWai Chong236e7ae2020-09-13 14:55:04 -070018779
TatWai Chong1719b712020-09-25 18:16:40 -070018780 uint64_t pmullb_vec_expected_d[] = {0x00000001fffffffc,
18781 0x00000003fffffff0,
18782 0x000000020000001c,
18783 0x00000007ffffffc0,
18784 0x3fffffff80000000,
18785 0x4000000000000000};
TatWai Chong236e7ae2020-09-13 14:55:04 -070018786
TatWai Chong1719b712020-09-25 18:16:40 -070018787 uint64_t pmullt_vec_expected_d[] = {0x05,
18788 0x11,
18789 0x15,
18790 0x3fffffff80000000,
18791 0x1555555555555555};
18792
18793 uint64_t sqdmullb_idx_expected_d[] = {0xfffffffffffffff8,
18794 0xfffffffffffffff0,
18795 0xffffffffffffffb8,
18796 0xffffffffffffffa0,
18797 0x8000000100000000,
18798 INT64_MAX};
18799
18800 uint64_t sqdmullt_idx_expected_d[] =
18801 {8, // 2 * zn[11] * zm[8] = 2 * 4 * 1
18802 24, // 2 * zn[9] * zm[8] = 2 * 4 * 3
18803 80, // 2 * zn[7] * zm[4] = 2 * 8 * 5
18804 112, // 2 * zn[5] * zm[4] = 2 * 8 * 7
18805 0x7fffffffffffffff, // 2 * zn[3] * zm[0]
18806 0x8000000100000000}; // 2 * zn[1] * zm[0]
TatWai Chong236e7ae2020-09-13 14:55:04 -070018807
18808 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
18809 START();
18810
18811 InsrHelper(&masm, z31.VnS(), zn_inputs_s);
18812 InsrHelper(&masm, z30.VnS(), zm_inputs_s);
18813
18814 __ Sqdmullb(z1.VnD(), z31.VnS(), z30.VnS());
18815 __ Sqdmullt(z2.VnD(), z31.VnS(), z30.VnS());
TatWai Chong1719b712020-09-25 18:16:40 -070018816
TatWai Chong236e7ae2020-09-13 14:55:04 -070018817 __ Pmullb(z3.VnD(), z31.VnS(), z30.VnS());
18818 __ Pmullt(z4.VnD(), z31.VnS(), z30.VnS());
18819
TatWai Chong1719b712020-09-25 18:16:40 -070018820 __ Mov(z7, z30);
18821 __ Mov(z8, z31);
18822 __ Sqdmullb(z5.VnD(), z8.VnS(), z7.VnS(), 2);
18823 __ Sqdmullt(z6.VnD(), z8.VnS(), z7.VnS(), 0);
18824
TatWai Chong236e7ae2020-09-13 14:55:04 -070018825 END();
18826
18827 if (CAN_RUN()) {
18828 RUN();
18829
TatWai Chong1719b712020-09-25 18:16:40 -070018830 ASSERT_EQUAL_SVE(sqdmullb_vec_expected_d, z1.VnD());
18831 ASSERT_EQUAL_SVE(sqdmullt_vec_expected_d, z2.VnD());
18832 ASSERT_EQUAL_SVE(pmullb_vec_expected_d, z3.VnD());
18833 ASSERT_EQUAL_SVE(pmullt_vec_expected_d, z4.VnD());
18834 ASSERT_EQUAL_SVE(sqdmullb_idx_expected_d, z5.VnD());
18835 ASSERT_EQUAL_SVE(sqdmullt_idx_expected_d, z6.VnD());
TatWai Chong236e7ae2020-09-13 14:55:04 -070018836 }
18837}
18838
TatWai Chong3cb35a62020-12-05 21:22:08 -080018839TEST_SVE(sve2_integer_multiply_add_long_vector) {
18840 int32_t zn_inputs_s[] =
18841 {1, -2, 3, -4, 5, -6, 7, -8, INT32_MIN, INT32_MAX, INT32_MAX, INT32_MIN};
18842
18843 int32_t zm_inputs_s[] =
18844 {1, 2, 3, 4, 5, 6, 7, 8, INT32_MAX, INT32_MIN, INT32_MAX, INT32_MIN};
18845
18846 int64_t sqdmlalb_vec_expected_d[] =
TatWai Chong5c080292020-12-13 03:57:15 -080018847 {-3, -28, -69, -126, RawbitsToInt64(0x8000000100000001), INT64_MAX};
TatWai Chong3cb35a62020-12-05 21:22:08 -080018848
18849 int64_t sqdmlalt_vec_expected_d[] = {-3,
18850 14,
18851 47,
18852 96,
TatWai Chong5c080292020-12-13 03:57:15 -080018853 RawbitsToInt64(0x80000000ffffffff),
TatWai Chong3cb35a62020-12-05 21:22:08 -080018854 static_cast<int64_t>(
18855 0x7ffffffe00000002)};
18856
TatWai Chong05545662020-12-18 20:53:05 -080018857 int64_t sqdmlalb_idx_expected_d[] =
18858 {-11, // za.d[5] + 2 * zn.s[10] * zm.s[8] = 5 + 2 * -2 * 4
18859 -28, // za.d[4] + 2 * zn.s[8] * zm.s[8] = 4 + 2 * -4 * 4
18860 -93, // za.d[3] + 2 * zn.s[6] * zm.s[4] = 3 + 2 * -6 * 8
18861 -126, // za.d[2] + 2 * zn.s[4] * zm.s[4] = 2 + 2 * -8 * 8
18862 RawbitsToInt64(0x8000000100000001),
18863 INT64_MAX};
18864
18865 int64_t sqdmlalt_idx_expected_d[] =
18866 {1, // za.d[5] + 2 * zn.s[11] * zm.s[9] = -5 + 2 * 1 * 3
18867 14, // za.d[4] + 2 * zn.s[9] * zm.s[9] = -4 + 2 * 3 * 3
18868 67, // za.d[3] + 2 * zn.s[7] * zm.s[5] = -3 + 2 * 5 * 7
18869 96, // za.d[2] + 2 * zn.s[5] * zm.s[5] = -2 + 2 * 7 * 7
18870 RawbitsToInt64(0x80000000ffffffff),
18871 static_cast<int64_t>(0x7ffffffe00000002)};
18872
TatWai Chong3cb35a62020-12-05 21:22:08 -080018873 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
18874 START();
18875
TatWai Chong05545662020-12-18 20:53:05 -080018876 InsrHelper(&masm, z0.VnS(), zn_inputs_s);
18877 InsrHelper(&masm, z1.VnS(), zm_inputs_s);
18878 __ Index(z2.VnD(), 0, 1);
18879 __ Index(z3.VnD(), 0, -1);
TatWai Chong3cb35a62020-12-05 21:22:08 -080018880
TatWai Chong05545662020-12-18 20:53:05 -080018881 __ Mov(z31, z2);
18882 __ Sqdmlalb(z31.VnD(), z31.VnD(), z0.VnS(), z1.VnS());
18883 __ Mov(z30, z3);
18884 __ Sqdmlalt(z30.VnD(), z30.VnD(), z0.VnS(), z1.VnS());
18885 __ Mov(z29, z31);
18886 __ Sqdmlslb(z29.VnD(), z29.VnD(), z0.VnS(), z1.VnS());
18887 __ Mov(z28, z30);
18888 __ Sqdmlslt(z28.VnD(), z28.VnD(), z0.VnS(), z1.VnS());
TatWai Chong3cb35a62020-12-05 21:22:08 -080018889
Martyn Capewell13c08b72021-03-11 11:48:52 +000018890 __ Sqdmlalb(z27.VnD(), z2.VnD(), z0.VnS(), z1.VnS());
18891 __ Sqdmlalt(z26.VnD(), z3.VnD(), z0.VnS(), z1.VnS());
18892 __ Sqdmlslb(z25.VnD(), z27.VnD(), z0.VnS(), z1.VnS());
18893 __ Sqdmlslt(z24.VnD(), z26.VnD(), z0.VnS(), z1.VnS());
TatWai Chong05545662020-12-18 20:53:05 -080018894
18895 __ Mov(z23, z2);
18896 __ Sqdmlalb(z23.VnD(), z23.VnD(), z0.VnS(), z1.VnS(), 0);
18897 __ Mov(z22, z3);
18898 __ Sqdmlalt(z22.VnD(), z22.VnD(), z0.VnS(), z1.VnS(), 1);
18899 __ Mov(z21, z23);
18900 __ Sqdmlslb(z21.VnD(), z21.VnD(), z0.VnS(), z1.VnS(), 0);
18901 __ Mov(z20, z22);
18902 __ Sqdmlslt(z20.VnD(), z20.VnD(), z0.VnS(), z1.VnS(), 1);
18903
TatWai Chong3cb35a62020-12-05 21:22:08 -080018904
18905 END();
18906
18907 if (CAN_RUN()) {
18908 RUN();
18909
TatWai Chong05545662020-12-18 20:53:05 -080018910 ASSERT_EQUAL_SVE(sqdmlalb_vec_expected_d, z31.VnD());
18911 ASSERT_EQUAL_SVE(sqdmlalt_vec_expected_d, z30.VnD());
18912 ASSERT_EQUAL_SVE(z2, z29);
18913 ASSERT_EQUAL_SVE(z3, z28);
TatWai Chong3cb35a62020-12-05 21:22:08 -080018914
Martyn Capewell13c08b72021-03-11 11:48:52 +000018915 ASSERT_EQUAL_SVE(z31, z27);
18916 ASSERT_EQUAL_SVE(z30, z26);
18917 ASSERT_EQUAL_SVE(z29, z25);
18918 ASSERT_EQUAL_SVE(z28, z24);
TatWai Chong05545662020-12-18 20:53:05 -080018919
18920 ASSERT_EQUAL_SVE(sqdmlalb_idx_expected_d, z23.VnD());
18921 ASSERT_EQUAL_SVE(sqdmlalt_idx_expected_d, z22.VnD());
18922 ASSERT_EQUAL_SVE(z2, z21);
18923 ASSERT_EQUAL_SVE(z3, z20);
TatWai Chong3cb35a62020-12-05 21:22:08 -080018924 }
18925}
18926
Martyn Capewell932cbb02020-11-10 16:53:31 +000018927TEST_SVE(sve2_ldnt1) {
18928 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
18929 START();
18930
18931 int data_size = kZRegMaxSizeInBytes * 4;
18932 uint8_t* data = new uint8_t[data_size];
18933 for (int i = 0; i < data_size; i++) {
18934 data[i] = i & 0xff;
18935 }
18936
18937 // Set the base half-way through the buffer so we can use negative indices.
18938 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
18939 __ Index(z30.VnD(), x0, 1);
18940 __ Ptrue(p0.VnB());
18941 __ Punpklo(p1.VnH(), p0.VnB());
18942 __ Punpklo(p2.VnH(), p1.VnB());
18943 __ Punpklo(p3.VnH(), p2.VnB());
18944 __ Punpklo(p4.VnH(), p3.VnB());
18945
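  // Each Ldnt1* below uses a vector base plus scalar offset, while the matching
  // Ld1* gather uses a scalar base plus vector offset. The address computation is
  // the same either way, so each pair of loads must return identical values.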
18946 __ Mov(x1, 1);
18947 __ Ldnt1b(z0.VnD(), p1.Zeroing(), SVEMemOperand(z30.VnD(), x1));
18948 __ Ld1b(z1.VnD(), p1.Zeroing(), SVEMemOperand(x1, z30.VnD()));
18949
18950 __ Mov(x1, -4);
18951 __ Ldnt1h(z2.VnD(), p2.Zeroing(), SVEMemOperand(z30.VnD(), x1));
18952 __ Ld1h(z3.VnD(), p2.Zeroing(), SVEMemOperand(x1, z30.VnD()));
18953
18954 __ Mov(x1, 16);
18955 __ Ldnt1w(z4.VnD(), p3.Zeroing(), SVEMemOperand(z30.VnD(), x1));
18956 __ Ld1w(z5.VnD(), p3.Zeroing(), SVEMemOperand(x1, z30.VnD()));
18957
18958 __ Mov(x1, -16);
18959 __ Ldnt1d(z6.VnD(), p4.Zeroing(), SVEMemOperand(z30.VnD(), x1));
18960 __ Ld1d(z7.VnD(), p4.Zeroing(), SVEMemOperand(x1, z30.VnD()));
18961
18962 __ Mov(x1, 1);
18963 __ Ldnt1sb(z8.VnD(), p0.Zeroing(), SVEMemOperand(z30.VnD(), x1));
18964 __ Ld1sb(z9.VnD(), p0.Zeroing(), SVEMemOperand(x1, z30.VnD()));
18965
18966 __ Mov(x1, -4);
18967 __ Ldnt1sh(z10.VnD(), p2.Zeroing(), SVEMemOperand(z30.VnD(), x1));
18968 __ Ld1sh(z11.VnD(), p2.Zeroing(), SVEMemOperand(x1, z30.VnD()));
18969
18970 __ Mov(x1, 16);
18971 __ Ldnt1sw(z12.VnD(), p3.Zeroing(), SVEMemOperand(z30.VnD(), x1));
18972 __ Ld1sw(z13.VnD(), p3.Zeroing(), SVEMemOperand(x1, z30.VnD()));
18973
18974 END();
18975
18976 if (CAN_RUN()) {
18977 RUN();
18978 ASSERT_EQUAL_SVE(z0, z1);
18979 ASSERT_EQUAL_SVE(z2, z3);
18980 ASSERT_EQUAL_SVE(z4, z5);
18981 ASSERT_EQUAL_SVE(z6, z7);
18982 ASSERT_EQUAL_SVE(z8, z9);
18983 ASSERT_EQUAL_SVE(z10, z11);
18984 ASSERT_EQUAL_SVE(z12, z13);
18985 }
  delete[] data;
18986}
18987
Martyn Capewell1ac83572020-11-11 16:15:58 +000018988TEST_SVE(sve2_stnt1) {
18989 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
18990 START();
18991
18992 int data_size = kZRegMaxSizeInBytes * 4;
18993 uint8_t* data = new uint8_t[data_size];
18994
18995 // Set the base half-way through the buffer so we can use negative indices.
18996 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[data_size / 2]));
18997 __ Ptrue(p0.VnB());
18998 __ Punpklo(p1.VnH(), p0.VnB());
18999 __ Punpklo(p2.VnH(), p1.VnB());
19000 __ Punpklo(p3.VnH(), p2.VnB());
19001 __ Punpklo(p4.VnH(), p3.VnB());
19002 __ Dup(z0.VnB(), 0xaa);
19003 __ Dup(z1.VnB(), 0x55);
19004 __ Rdvl(x1, 1);
19005 __ Mov(x3, 0);
19006
19007 // Put store addresses into z30, and a small offset in x4.
19008 __ Index(z30.VnD(), x0, 1);
19009 __ Mov(x4, 2);
19010
19011 // Store an entire vector of 0xaa to the buffer, then a smaller scatter store
19012 // of 0x55 using Stnt1b.
19013 __ St1b(z0.VnB(), p0, SVEMemOperand(x0, x4));
19014 __ Stnt1b(z1.VnD(), p0, SVEMemOperand(z30.VnD(), x4));
19015
19016 // Load the entire vector back from the buffer.
19017 __ Ld1b(z2.VnB(), p0.Zeroing(), SVEMemOperand(x0, x4));
19018
19019 // Construct a predicate that reflects the number of bytes stored by Stnt1b,
19020 // based on the current VL, and use Sel to obtain a reference vector for
19021 // comparison.
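  // (Rdvl wrote the vector length in bytes to x1, so x1 >> 3 is the number of
  // D-sized lanes, which is the number of bytes written by Stnt1b.)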
19022 __ Lsr(x2, x1, 3);
19023 __ Whilelo(p5.VnB(), x3, x2);
19024 __ Sel(z3.VnB(), p5.Merging(), z1.VnB(), z0.VnB());
19025
19026 // Repeat for larger element sizes.
19027 __ Mov(x4, -4);
19028 __ Index(z30.VnD(), x0, 2);
19029 __ St1b(z0.VnB(), p0, SVEMemOperand(x0, x4));
19030 __ Stnt1h(z1.VnD(), p0, SVEMemOperand(z30.VnD(), x4));
19031 __ Ld1b(z4.VnB(), p0.Zeroing(), SVEMemOperand(x0, x4));
19032 __ Lsr(x2, x1, 2);
19033 __ Whilelo(p5.VnB(), x3, x2);
19034 __ Sel(z5.VnB(), p5.Merging(), z1.VnB(), z0.VnB());
19035
19036 __ Mov(x4, 16);
19037 __ Index(z30.VnD(), x0, 4);
19038 __ St1b(z0.VnB(), p0, SVEMemOperand(x0, x4));
19039 __ Stnt1w(z1.VnD(), p0, SVEMemOperand(z30.VnD(), x4));
19040 __ Ld1b(z6.VnB(), p0.Zeroing(), SVEMemOperand(x0, x4));
19041 __ Lsr(x2, x1, 1);
19042 __ Whilelo(p5.VnB(), x3, x2);
19043 __ Sel(z7.VnB(), p5.Merging(), z1.VnB(), z0.VnB());
19044
19045 __ Mov(x4, -16);
19046 __ Index(z30.VnD(), x0, 8);
19047 __ St1b(z0.VnB(), p0, SVEMemOperand(x0, x4));
19048 __ Stnt1d(z1.VnD(), p0, SVEMemOperand(z30.VnD(), x4));
19049 __ Ld1b(z8.VnB(), p0.Zeroing(), SVEMemOperand(x0, x4));
19050 __ Whilelo(p5.VnB(), x3, x1);
19051 __ Sel(z9.VnB(), p5.Merging(), z1.VnB(), z0.VnB());
19052 END();
19053
19054 if (CAN_RUN()) {
19055 RUN();
19056 ASSERT_EQUAL_SVE(z2, z3);
19057 ASSERT_EQUAL_SVE(z4, z5);
19058 ASSERT_EQUAL_SVE(z6, z7);
19059 ASSERT_EQUAL_SVE(z8, z9);
19060 }
  delete[] data;
19061}
19062
Martyn Capewellf0844012020-10-23 16:38:26 +010019063TEST_SVE(sve2_while_simple) {
19064 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
19065
19066 START();
19067 __ Mov(x0, 1);
19068 __ Mov(x1, 0);
19069 __ Mov(x2, 3);
19070
19071 __ Whilehi(p0.VnB(), x0, x1);
19072 __ Whilehs(p1.VnB(), x0, x1);
19073 __ Whilehi(p2.VnB(), x2, x1);
19074 __ Whilehs(p3.VnB(), x2, x1);
19075 __ Whilehi(p4.VnB(), x2, x0);
19076 __ Whilehs(p5.VnB(), x2, x0);
19077
19078 __ Whilegt(p6.VnB(), x0, x1);
19079 __ Whilege(p7.VnB(), x0, x1);
19080 __ Whilegt(p8.VnB(), x2, x1);
19081 __ Whilege(p9.VnB(), x2, x1);
19082 __ Whilegt(p10.VnB(), x2, x0);
19083 __ Whilege(p11.VnB(), x2, x0);
19084
19085 __ Mov(x4, 0x80000000);
19086 __ Mov(x5, 0x80000001);
19087 __ Whilege(p12.VnB(), w5, w4);
19088 __ Whilegt(p13.VnB(), w5, w4);
19089
19090 __ Mov(x6, 0x8000000000000000);
19091 __ Mov(x7, 0x8000000000000001);
19092 __ Whilege(p14.VnB(), x7, x6);
19093 __ Whilegt(p15.VnB(), x7, x6);
19094
19095 for (int i = 0; i < 16; i++) {
19096 __ Rev(PRegister(i).VnB(), PRegister(i).VnB());
19097 }
19098
19099 END();
19100
19101 if (CAN_RUN()) {
19102 RUN();
19103 int p0_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
19104 int p1_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
19105 int p2_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
19106 int p3_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
19107 int p4_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1};
19108 int p5_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
19109 int p6_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
19110 int p7_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1};
19111 int p8_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
19112 int p9_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1};
19113 int p10_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1};
19114 int p11_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1};
19115 int p12_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
19116 int p13_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
19117 int p14_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
19118 int p15_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
19119
19120 ASSERT_EQUAL_SVE(p0_exp, p0.VnB());
19121 ASSERT_EQUAL_SVE(p1_exp, p1.VnB());
19122 ASSERT_EQUAL_SVE(p2_exp, p2.VnB());
19123 ASSERT_EQUAL_SVE(p3_exp, p3.VnB());
19124 ASSERT_EQUAL_SVE(p4_exp, p4.VnB());
19125 ASSERT_EQUAL_SVE(p5_exp, p5.VnB());
19126 ASSERT_EQUAL_SVE(p6_exp, p6.VnB());
19127 ASSERT_EQUAL_SVE(p7_exp, p7.VnB());
19128 ASSERT_EQUAL_SVE(p8_exp, p8.VnB());
19129 ASSERT_EQUAL_SVE(p9_exp, p9.VnB());
19130 ASSERT_EQUAL_SVE(p10_exp, p10.VnB());
19131 ASSERT_EQUAL_SVE(p11_exp, p11.VnB());
19132 ASSERT_EQUAL_SVE(p12_exp, p12.VnB());
19133 ASSERT_EQUAL_SVE(p13_exp, p13.VnB());
19134 ASSERT_EQUAL_SVE(p14_exp, p14.VnB());
19135 ASSERT_EQUAL_SVE(p15_exp, p15.VnB());
19136 }
19137}
19138
Martyn Capewell35636df2020-10-15 15:14:12 +010019139TEST_SVE(sve2_whilerw_whilewr_simple) {
19140 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
19141
19142 START();
19143 __ Mov(x0, 0);
19144 __ Mov(x1, 1);
19145 __ Mov(x2, 3);
19146
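  // Whilerw/Whilewr generate a predicate of elements that can safely be processed
  // in parallel given the two addresses: all elements are active when the
  // addresses are equal, or (for Whilewr) when the second address precedes the
  // first; otherwise only as many elements as fit in the address difference.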
19147 __ Whilerw(p0.VnB(), x0, x0);
19148 __ Whilerw(p1.VnB(), x0, x1);
19149 __ Whilerw(p2.VnB(), x1, x0);
19150
19151 __ Whilewr(p3.VnB(), x0, x0);
19152 __ Whilewr(p4.VnB(), x0, x1);
19153 __ Whilewr(p5.VnB(), x1, x0);
19154
19155 __ Whilewr(p6.VnH(), x1, x1);
19156 __ Whilewr(p7.VnH(), x1, x2);
19157 __ Whilewr(p8.VnH(), x2, x1);
19158
19159 END();
19160
19161 if (CAN_RUN()) {
19162 RUN();
19163 int p0_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
19164 ASSERT_EQUAL_SVE(p0_exp, p0.VnB());
19165 int p1_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
19166 ASSERT_EQUAL_SVE(p1_exp, p1.VnB());
19167 int p2_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
19168 ASSERT_EQUAL_SVE(p2_exp, p2.VnB());
19169 int p3_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
19170 ASSERT_EQUAL_SVE(p3_exp, p3.VnB());
19171 int p4_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
19172 ASSERT_EQUAL_SVE(p4_exp, p4.VnB());
19173 int p5_exp[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
19174 ASSERT_EQUAL_SVE(p5_exp, p5.VnB());
19175 int p6_exp[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
19176 ASSERT_EQUAL_SVE(p6_exp, p6.VnB());
19177 int p7_exp[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
19178 ASSERT_EQUAL_SVE(p7_exp, p7.VnB());
19179 int p8_exp[] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
19180 ASSERT_EQUAL_SVE(p8_exp, p8.VnB());
19181 }
19182}
19183
TatWai Chongada6b352020-11-06 13:48:09 -080019184TEST_SVE(sve2_sqrdcmlah) {
19185 int32_t zn_inputs[] = {-1, -2, -3, -4, 1, 2, 3, 4};
19186 int32_t zm_inputs[] = {-1, -2, 3, 4, 1, 2, -3, -4};
19187 int32_t za_inputs[] = {1, 2, 3, 4, 5, 6, 7, 8};
19188 int32_t zd_000_expected[] =
19189 {1025, 2050, -6141, -8188, 1029, 2054, -6137, -8184};
19190 int32_t zd_090_expected[] =
19191 {1025, -510, -6141, 4612, 1029, -506, -6137, 4616};
19192 int32_t zd_180_expected[] =
Martyn Capewell1b82f172021-01-14 17:55:57 +000019193 {-1023, -2046, 6147, 8196, -1019, -2042, 6151, 8200};
TatWai Chongada6b352020-11-06 13:48:09 -080019194 int32_t zd_270_expected[] =
Martyn Capewell1b82f172021-01-14 17:55:57 +000019195 {-1023, 514, 6147, -4604, -1019, 518, 6151, -4600};
TatWai Chong6b67f6e2020-12-03 23:37:57 -080019196 int32_t zd_0_270_expected[] =
Martyn Capewell1b82f172021-01-14 17:55:57 +000019197 {2049, -1534, 6147, -4604, 2053, -1530, 6151, -4600};
TatWai Chong6b67f6e2020-12-03 23:37:57 -080019198 int32_t zd_3_090_expected[] =
Martyn Capewell1b82f172021-01-14 17:55:57 +000019199 {1025, -510, 3075, -1532, 1029, -506, 3079, -1528};
TatWai Chongada6b352020-11-06 13:48:09 -080019200
19201 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
19202 START();
19203
19204 InsrHelper(&masm, z0.VnS(), zn_inputs);
19205 InsrHelper(&masm, z1.VnS(), zm_inputs);
19206 InsrHelper(&masm, z31.VnS(), za_inputs);
19207
19208 // The operand values are small, so shift them left by an arbitrary amount so
19209 // that they can affect the result in the destination.
19210 int shift = 20;
19211 __ Lsl(z0.VnS(), z0.VnS(), shift);
19212 __ Lsl(z1.VnS(), z1.VnS(), shift);
19213
19214 __ Mov(z10, z31);
19215 __ Sqrdcmlah(z10.VnS(), z10.VnS(), z0.VnS(), z1.VnS(), 0);
19216
19217 __ Mov(z11, z31);
19218 __ Sqrdcmlah(z11.VnS(), z11.VnS(), z0.VnS(), z1.VnS(), 90);
19219
19220 __ Mov(z12, z31);
19221 __ Sqrdcmlah(z12.VnS(), z12.VnS(), z0.VnS(), z1.VnS(), 180);
19222
19223 __ Mov(z13, z31);
19224 __ Sqrdcmlah(z13.VnS(), z13.VnS(), z0.VnS(), z1.VnS(), 270);
19225
Martyn Capewell13c08b72021-03-11 11:48:52 +000019226 __ Sqrdcmlah(z14.VnS(), z31.VnS(), z0.VnS(), z1.VnS(), 0);
19227 __ Sqrdcmlah(z15.VnS(), z31.VnS(), z0.VnS(), z1.VnS(), 90);
19228 __ Sqrdcmlah(z16.VnS(), z31.VnS(), z0.VnS(), z1.VnS(), 180);
19229 __ Sqrdcmlah(z17.VnS(), z31.VnS(), z0.VnS(), z1.VnS(), 270);
TatWai Chongada6b352020-11-06 13:48:09 -080019230
TatWai Chong6b67f6e2020-12-03 23:37:57 -080019231 __ Mov(z18, z31);
19232 __ Sqrdcmlah(z18.VnS(), z18.VnS(), z0.VnS(), z1.VnS(), 0, 270);
19233
19234 __ Mov(z19, z31);
19235 __ Sqrdcmlah(z19.VnS(), z19.VnS(), z0.VnS(), z1.VnS(), 1, 90);
19236
TatWai Chongada6b352020-11-06 13:48:09 -080019237 END();
19238
19239 if (CAN_RUN()) {
19240 RUN();
19241
19242 ASSERT_EQUAL_SVE(zd_000_expected, z10.VnS());
19243 ASSERT_EQUAL_SVE(zd_090_expected, z11.VnS());
19244 ASSERT_EQUAL_SVE(zd_180_expected, z12.VnS());
19245 ASSERT_EQUAL_SVE(zd_270_expected, z13.VnS());
19246
Martyn Capewell13c08b72021-03-11 11:48:52 +000019247 ASSERT_EQUAL_SVE(z14, z10);
19248 ASSERT_EQUAL_SVE(z15, z11);
19249 ASSERT_EQUAL_SVE(z16, z12);
19250 ASSERT_EQUAL_SVE(z17, z13);
TatWai Chong6b67f6e2020-12-03 23:37:57 -080019251
19252 ASSERT_EQUAL_SVE(zd_0_270_expected, z18.VnS());
19253 ASSERT_EQUAL_SVE(zd_3_090_expected, z19.VnS());
TatWai Chongada6b352020-11-06 13:48:09 -080019254 }
19255}
19256
TatWai Chongcad27c52021-03-10 11:26:50 -080019257TEST_SVE(sve2_sqrdmlah) {
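  // Sqrdmlah: each element of zda becomes
  // SignedSaturate((zda * 2^esize + 2 * zn * zm + 2^(esize - 1)) >> esize),
  // i.e. a saturating rounding doubling multiply-accumulate returning the high half.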
19258 uint16_t zn_inputs_h[] = {0x7ffe, 0x7ffd, 0x7ffd, 0x7ffd, 0x8000,
19259 0x7fff, 0x7ffe, 0x7ffe, 0x8001, 0x8000,
19260 0x7ffd, 0x7ffd, 0x7ffd, 0x5555, 0x5555,
19261 0x5555, 0x8000, 0x8000, 0xaaaa, 0x8001};
19262
19263 uint16_t zm_inputs_h[] = {0x7ffd, 0x7fff, 0x7ffe, 0x7ffd, 0x8001,
19264 0x7fff, 0x7fff, 0x7ffe, 0x8000, 0x8000,
19265 0xaaaa, 0x0001, 0x0001, 0xaaaa, 0xaaaa,
19266 0xcccc, 0x8000, 0x8000, 0x8000, 0x8001};
19267
19268 uint16_t za_inputs_h[] = {0x1010, 0x1010, 0x1010, 0x1010, 0x1010,
19269 0x1010, 0x1010, 0x1010, 0x8000, 0x8011,
19270 0x8006, 0xff7d, 0xfeff, 0xaabc, 0xaabb,
19271 0x9c72, 0x8000, 0x0000, 0x8000, 0xffff};
19272
19273 uint16_t zd_expected_h[] = {0x7fff, 0x7fff, 0x7fff, 0x7fff, 0x7fff,
19274 0x7fff, 0x7fff, 0x7fff, 0xffff, 0x0011,
19275 0x8000, 0xff7e, 0xff00, 0x8000, 0x8000,
19276 0x8000, 0x0000, 0x7fff, 0xd556, 0x7ffd};
19277
19278 uint32_t zn_inputs_s[] = {0x04000000,
19279 0x80000000,
19280 0x04000000,
19281 0x80000000,
19282 0x80000000,
19283 0x80000001,
19284 0x7fffffff,
19285 0x80000000,
19286 0x7ffffffe,
19287 0x7ffffffd,
19288 0x7ffffffd,
19289 0x7ffffffd};
19290
19291 uint32_t zm_inputs_s[] = {0x00000020,
19292 0x80000000,
19293 0x00000010,
19294 0x80000000,
19295 0x7fffffff,
19296 0x80000000,
19297 0x80000000,
19298 0x80000001,
19299 0x7ffffffd,
19300 0x7fffffff,
19301 0x7ffffffe,
19302 0x7ffffffd};
19303
19304 uint32_t za_inputs_s[] = {0x00000000,
19305 0x00000000,
19306 0x00000020,
19307 0x00108000,
19308 0x00000000,
19309 0x00000001,
19310 0x00000000,
19311 0x00000001,
19312 0x10101010,
19313 0x10101010,
19314 0x10101010,
19315 0x10101010};
19316
19317 uint32_t zd_expected_s[] = {0x00000001,
19318 0x7fffffff,
19319 0x00000021,
19320 0x7fffffff,
19321 0x80000001,
19322 0x7fffffff,
19323 0x80000001,
19324 0x7fffffff,
19325 0x7fffffff,
19326 0x7fffffff,
19327 0x7fffffff,
19328 0x7fffffff};
19329
19330 uint64_t zn_inputs_d[] = {0x0400000000000000, 0x8000000000000000,
19331 0x0400000000000000, 0x8000000000000000,
19332 0x8000000000000000, 0x8000000000000001,
19333 0x7fffffffffffffff, 0x8000000000000000,
19334 0x7ffffffffffffffe, 0x7ffffffffffffffd,
19335 0x7ffffffffffffffd, 0x7ffffffffffffffd,
19336 0xf1299accc9186169, 0xd529d2675ee9da21,
19337 0x1a10b5d60b92dcf9, 0xfb1d358e0e6455b1,
19338 0x8eb7721078bdc589, 0x4171509750ded141,
19339 0x8eb7721078bdc589, 0x4171509750ded141};
19340
19341 uint64_t zm_inputs_d[] = {0x0000000000000020, 0x8000000000000000,
19342 0x0000000000000010, 0x8000000000000000,
19343 0x7fffffffffffffff, 0x8000000000000000,
19344 0x8000000000000000, 0x8000000000000001,
19345 0x7ffffffffffffffd, 0x7fffffffffffffff,
19346 0x7ffffffffffffffe, 0x7ffffffffffffffd,
19347 0x30b940efe73f180e, 0x3bc1ff1e52a99b66,
19348 0x40de5c9793535a5e, 0x24752faf47bdddb6,
19349 0x162663016b07e5ae, 0x1de34b56f3d22006,
19350 0x8eb7721078bdc589, 0x4171509750ded141};
19351
19352 uint64_t za_inputs_d[] = {0x0000000000000000, 0x0000000000000000,
19353 0x0000000000000020, 0x0010108000000000,
19354 0x0000000000000000, 0x0000000000000001,
19355 0x0000000000000000, 0x0000000000000001,
19356 0x1010101010101010, 0x1010101010101010,
19357 0x1010101010101010, 0x1010101010101010,
19358 0xb18253371b2c2c77, 0xa70de31e6645eaef,
19359 0xda817198c0318487, 0x9fd9e6b8e04b42ff,
19360 0xced1f6b7119ab197, 0x01ae051a85509b0f,
19361 0x01a211e9352f7927, 0x7667b70a5b13749f};
19362
19363 uint64_t zd_expected_d[] = {0x0000000000000001, 0x7fffffffffffffff,
19364 0x0000000000000021, 0x7fffffffffffffff,
19365 0x8000000000000001, 0x7fffffffffffffff,
19366 0x8000000000000001, 0x7fffffffffffffff,
19367 0x7fffffffffffffff, 0x7fffffffffffffff,
19368 0x7fffffffffffffff, 0x7fffffffffffffff,
19369 0xabdc73dea0d72a35, 0x930e3dc877301966,
19370 0xe7b7145a059f8a9f, 0x9e75a4a9d10cf8af,
19371 0xbb378528642d2581, 0x10f5e6d693ffddf3,
19372 0x65e455a46adc091c, 0x7fffffffffffffff};
19373
19374 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
19375 START();
19376
19377 InsrHelper(&masm, z0.VnH(), zn_inputs_h);
19378 InsrHelper(&masm, z1.VnH(), zm_inputs_h);
19379 InsrHelper(&masm, z2.VnH(), za_inputs_h);
19380
19381 __ Sqrdmlah(z2.VnH(), z2.VnH(), z0.VnH(), z1.VnH());
19382
19383 InsrHelper(&masm, z3.VnS(), zn_inputs_s);
19384 InsrHelper(&masm, z4.VnS(), zm_inputs_s);
19385 InsrHelper(&masm, z5.VnS(), za_inputs_s);
19386
19387 __ Sqrdmlah(z5.VnS(), z5.VnS(), z3.VnS(), z4.VnS());
19388
19389 InsrHelper(&masm, z6.VnD(), zn_inputs_d);
19390 InsrHelper(&masm, z7.VnD(), zm_inputs_d);
19391 InsrHelper(&masm, z8.VnD(), za_inputs_d);
19392
19393 __ Sqrdmlah(z8.VnD(), z8.VnD(), z6.VnD(), z7.VnD());
19394
19395 END();
19396
19397 if (CAN_RUN()) {
19398 RUN();
19399 ASSERT_EQUAL_SVE(zd_expected_h, z2.VnH());
19400 ASSERT_EQUAL_SVE(zd_expected_s, z5.VnS());
19401 ASSERT_EQUAL_SVE(zd_expected_d, z8.VnD());
19402 }
19403}
19404
TatWai Chongada6b352020-11-06 13:48:09 -080019405TEST_SVE(sve2_cmla) {
19406 int32_t zn_inputs_s[] = {-2, -4, -6, -8, 2, 4, 6, 8};
19407 int32_t zm_inputs_s[] = {-2, -4, -6, -8, 2, 4, 6, 8};
19408 int32_t zda_inputs_s[] = {1, 2, 3, 4, 5, 6, 7, 8};
19409 int32_t zd_000_expected[] = {9, 18, 51, 68, 13, 22, 55, 72};
19410 int32_t zd_090_expected[] = {9, -2, 51, -32, 13, 2, 55, -28};
Martyn Capewell1b82f172021-01-14 17:55:57 +000019411 int32_t zd_180_expected[] = {-7, -14, -45, -60, -3, -10, -41, -56};
19412 int32_t zd_270_expected[] = {-7, 6, -45, 40, -3, 10, -41, 44};
TatWai Chongada6b352020-11-06 13:48:09 -080019413
19414 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
19415 START();
19416
19417 InsrHelper(&masm, z31.VnS(), zn_inputs_s);
19418 InsrHelper(&masm, z30.VnS(), zm_inputs_s);
19419
19420 InsrHelper(&masm, z0.VnS(), zda_inputs_s);
19421 __ Mov(z29, z0);
19422 __ Cmla(z0.VnS(), z0.VnS(), z31.VnS(), z30.VnS(), 0);
19423
19424 InsrHelper(&masm, z1.VnS(), zda_inputs_s);
19425 __ Mov(z28, z1);
19426 __ Cmla(z1.VnS(), z1.VnS(), z31.VnS(), z30.VnS(), 90);
19427
19428 InsrHelper(&masm, z2.VnS(), zda_inputs_s);
19429 __ Mov(z27, z2);
19430 __ Cmla(z2.VnS(), z2.VnS(), z31.VnS(), z30.VnS(), 180);
19431
19432 InsrHelper(&masm, z3.VnS(), zda_inputs_s);
19433 __ Mov(z26, z3);
19434 __ Cmla(z3.VnS(), z3.VnS(), z31.VnS(), z30.VnS(), 270);
19435
Martyn Capewell13c08b72021-03-11 11:48:52 +000019436 __ Cmla(z4.VnS(), z29.VnS(), z31.VnS(), z30.VnS(), 0);
19437 __ Cmla(z5.VnS(), z28.VnS(), z31.VnS(), z30.VnS(), 90);
19438 __ Cmla(z6.VnS(), z27.VnS(), z31.VnS(), z30.VnS(), 180);
19439 __ Cmla(z7.VnS(), z26.VnS(), z31.VnS(), z30.VnS(), 270);
TatWai Chongada6b352020-11-06 13:48:09 -080019440
19441 END();
19442
19443 if (CAN_RUN()) {
19444 RUN();
19445
19446 ASSERT_EQUAL_SVE(zd_000_expected, z0.VnS());
19447 ASSERT_EQUAL_SVE(zd_090_expected, z1.VnS());
19448 ASSERT_EQUAL_SVE(zd_180_expected, z2.VnS());
19449 ASSERT_EQUAL_SVE(zd_270_expected, z3.VnS());
19450
Martyn Capewell13c08b72021-03-11 11:48:52 +000019451 ASSERT_EQUAL_SVE(z4, z0);
19452 ASSERT_EQUAL_SVE(z5, z1);
19453 ASSERT_EQUAL_SVE(z6, z2);
19454 ASSERT_EQUAL_SVE(z7, z3);
TatWai Chongada6b352020-11-06 13:48:09 -080019455 }
19456}
19457
TatWai Chong5c080292020-12-13 03:57:15 -080019458TEST_SVE(sve2_integer_saturating_multiply_add_long) {
19459 int32_t zn_bottom_inputs[] =
19460 {-2, -4, -6, -8, INT32_MAX, INT32_MIN, INT32_MIN};
19461
19462 int32_t zm_top_inputs[] = {1, 3, 5, 7, INT32_MAX, INT32_MAX, INT32_MIN};
19463
19464 int64_t sqdmlalbt_expected[] = {2,
19465 -19,
19466 -56,
19467 -109,
19468 static_cast<int64_t>(0x7ffffffe00000004),
19469 RawbitsToInt64(0x8000000100000001),
19470 INT64_MAX};
19471
19472 int64_t sqdmlslbt_expected[] = {-2,
19473 19,
19474 56,
19475 109,
19476 RawbitsToInt64(0x80000001fffffffc),
19477 static_cast<int64_t>(0x7ffffffeffffffff),
19478 RawbitsToInt64(0x8000000000000001)};
19479
19480 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
19481 START();
19482
19483 InsrHelper(&masm, z31.VnS(), zn_bottom_inputs);
19484 InsrHelper(&masm, z30.VnS(), zm_top_inputs);
19485
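  // Interleave the inputs with zero so that the zn values land in the even
  // (bottom) S lanes and the zm values in the odd (top) S lanes, as consumed by
  // the 'bt' (bottom-by-top) instruction forms.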
19486 __ Dup(z29.VnD(), 0);
19487 __ Zip1(z31.VnS(), z31.VnS(), z29.VnS());
19488 __ Zip1(z30.VnS(), z29.VnS(), z30.VnS());
19489
19490 // Initialise inputs for za.
19491 __ Index(z1.VnD(), 0, 1);
19492 __ Index(z2.VnD(), 0, -1);
19493
19494 __ Sqdmlalbt(z1.VnD(), z1.VnD(), z31.VnS(), z30.VnS());
19495 __ Sqdmlslbt(z2.VnD(), z2.VnD(), z31.VnS(), z30.VnS());
19496
19497 END();
19498
19499 if (CAN_RUN()) {
19500 RUN();
19501
19502 ASSERT_EQUAL_SVE(sqdmlalbt_expected, z1.VnD());
19503 ASSERT_EQUAL_SVE(sqdmlslbt_expected, z2.VnD());
19504 }
19505}
19506
TatWai Chongba9a1482020-10-01 20:25:54 -070019507TEST_SVE(sve2_floating_point_multiply_add_long_vector) {
19508 uint16_t zn_inputs[] = {Float16ToRawbits(Float16(1000)),
19509 Float16ToRawbits(Float16(2000)),
19510 Float16ToRawbits(Float16(0.5)),
19511 Float16ToRawbits(Float16(-0.5)),
19512 Float16ToRawbits(Float16(14)),
19513 Float16ToRawbits(Float16(-14)),
19514 Float16ToRawbits(kFP16PositiveInfinity),
19515 Float16ToRawbits(kFP16NegativeInfinity)};
19516
19517 uint16_t zm_inputs[] = {Float16ToRawbits(Float16(10)),
19518 Float16ToRawbits(Float16(-10)),
19519 Float16ToRawbits(Float16(10)),
19520 Float16ToRawbits(Float16(-10)),
19521 Float16ToRawbits(Float16(10)),
19522 Float16ToRawbits(Float16(-10)),
19523 Float16ToRawbits(Float16(10)),
19524 Float16ToRawbits(Float16(-10))};
19525
19526 uint32_t za_inputs[] = {FloatToRawbits(1.0f),
19527 FloatToRawbits(-1.0f),
19528 FloatToRawbits(1.0f),
19529 FloatToRawbits(-1.0f)};
19530
19531 uint32_t fmlalb_zd_expected[] = {0xc69c3e00, // -19999
19532 0x40800000, // 4
19533 0x430d0000, // 141
19534 FloatToRawbits(kFP32PositiveInfinity)};
19535
19536 uint32_t fmlalt_zd_expected[] = {0x461c4400, // 10001
19537 0x40800000, // 4
19538 0x430d0000, // 141
19539 FloatToRawbits(kFP32PositiveInfinity)};
19540
19541 uint32_t fmlslb_zd_expected[] = {0x469c4200, // 20001
19542 0xc0c00000, // -6
19543 0xc30b0000, // -139
19544 FloatToRawbits(kFP32NegativeInfinity)};
19545
19546 uint32_t fmlslt_zd_expected[] = {0xc61c3c00, // -9999
19547 0xc0c00000, // -6
19548 0xc30b0000, // -139
19549 FloatToRawbits(kFP32NegativeInfinity)};
19550
19551 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
19552 START();
19553
19554 InsrHelper(&masm, z31.VnH(), zn_inputs);
19555 InsrHelper(&masm, z30.VnH(), zm_inputs);
19556 InsrHelper(&masm, z29.VnS(), za_inputs);
19557
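  // Fmlalb/Fmlslb operate on the even-numbered (bottom) half-precision elements
  // and Fmlalt/Fmlslt on the odd-numbered (top) ones, widening each product to
  // single precision before accumulating.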
19558 __ Mov(z0, z29);
19559 __ Fmlalb(z0.VnS(), z0.VnS(), z31.VnH(), z30.VnH());
19560
19561 __ Mov(z1, z29);
19562 __ Fmlalt(z1.VnS(), z1.VnS(), z31.VnH(), z30.VnH());
19563
19564 __ Mov(z2, z29);
19565 __ Fmlslb(z2.VnS(), z2.VnS(), z31.VnH(), z30.VnH());
19566
19567 __ Mov(z3, z29);
19568 __ Fmlslt(z3.VnS(), z3.VnS(), z31.VnH(), z30.VnH());
19569
Martyn Capewell13c08b72021-03-11 11:48:52 +000019570 __ Fmlalb(z4.VnS(), z29.VnS(), z31.VnH(), z30.VnH());
19571 __ Fmlalt(z5.VnS(), z29.VnS(), z31.VnH(), z30.VnH());
19572 __ Fmlslb(z6.VnS(), z29.VnS(), z31.VnH(), z30.VnH());
19573 __ Fmlslt(z7.VnS(), z29.VnS(), z31.VnH(), z30.VnH());
TatWai Chongba9a1482020-10-01 20:25:54 -070019574
19575 END();
19576
19577 if (CAN_RUN()) {
19578 RUN();
19579
19580 ASSERT_EQUAL_SVE(fmlalb_zd_expected, z0.VnS());
19581 ASSERT_EQUAL_SVE(fmlalt_zd_expected, z1.VnS());
19582 ASSERT_EQUAL_SVE(fmlslb_zd_expected, z2.VnS());
19583 ASSERT_EQUAL_SVE(fmlslt_zd_expected, z3.VnS());
19584
Martyn Capewell13c08b72021-03-11 11:48:52 +000019585 ASSERT_EQUAL_SVE(z4, z0);
19586 ASSERT_EQUAL_SVE(z5, z1);
19587 ASSERT_EQUAL_SVE(z6, z2);
19588 ASSERT_EQUAL_SVE(z7, z3);
TatWai Chongba9a1482020-10-01 20:25:54 -070019589 }
19590}
19591
Martyn Capewell342876b2021-02-24 16:34:33 +000019592TEST_SVE(sve2_flogb_simple) {
19593 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVE2);
19594
19595 START();
19596 __ Ptrue(p0.VnB());
19597 __ Index(z0.VnS(), -4, 1);
19598 __ Mov(z1.VnS(), 0);
19599 __ Mov(z2.VnD(), 0x000fffffffffffff);
19600 __ Mov(z3.VnD(), 0x0010000000000000);
19601 __ Scvtf(z0.VnS(), p0.Merging(), z0.VnS());
19602 __ Scvtf(z1.VnS(), p0.Merging(), z1.VnS());
19603 __ Fdiv(z1.VnS(), p0.Merging(), z0.VnS(), z1.VnS());
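  // Flogb extracts the signed base-2 exponent of each element: zero and NaN
  // inputs produce the most negative integer, infinities the most positive, and
  // subnormal inputs return their true (unbiased) exponent.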
19604 __ Flogb(z0.VnS(), p0.Merging(), z0.VnS());
19605 __ Flogb(z1.VnS(), p0.Merging(), z1.VnS());
19606 __ Flogb(z2.VnD(), p0.Merging(), z2.VnD());
19607 __ Flogb(z3.VnD(), p0.Merging(), z3.VnD());
19608 END();
19609
19610 if (CAN_RUN()) {
19611 RUN();
19612 uint64_t expected_z0[] = {0x0000000200000002,
19613 0x0000000200000002,
19614 0x0000000100000001,
19615 0x0000000080000000,
19616 0x0000000000000001,
19617 0x0000000100000002};
19618 ASSERT_EQUAL_SVE(expected_z0, z0.VnD());
19619
19620 uint64_t expected_z1[] = {0x7fffffff7fffffff,
19621 0x7fffffff7fffffff,
19622 0x7fffffff7fffffff,
19623 0x7fffffff80000000,
19624 0x7fffffff7fffffff,
19625 0x7fffffff7fffffff};
19626 ASSERT_EQUAL_SVE(expected_z1, z1.VnD());
19627
19628 uint64_t expected_z2[] = {0xfffffffffffffc01,
19629 0xfffffffffffffc01,
19630 0xfffffffffffffc01,
19631 0xfffffffffffffc01};
19632 ASSERT_EQUAL_SVE(expected_z2, z2.VnD());
19633
19634 uint64_t expected_z3[] = {0xfffffffffffffc02,
19635 0xfffffffffffffc02,
19636 0xfffffffffffffc02,
19637 0xfffffffffffffc02};
19638 ASSERT_EQUAL_SVE(expected_z3, z3.VnD());
19639 }
19640}
Martyn Capewelle642a962021-05-20 17:20:50 +010019641
Martyn Capewell8b1e7392021-06-08 17:32:49 +010019642TEST_SVE(neon_matmul) {
19643 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
19644 CPUFeatures::kSVEI8MM,
19645 CPUFeatures::kNEON,
19646 CPUFeatures::kI8MM);
19647
19648 // Test Neon integer matrix multiply against SVE.
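  // Each *mmla instruction multiplies a 2x8 matrix of bytes from the first source
  // by an 8x2 matrix of bytes from the second, accumulating into a 2x2 matrix of
  // 32-bit elements; the SVE forms do this independently per 128-bit segment.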
19649 START();
19650 __ Movi(v0.V2D(), 0xffeeddccbbaa9988, 0x77665544332211);
19651 __ Movi(v1.V2D(), 0xaa5555aa55555555, 0x55aaaa55aaaaaa);
19652 __ Movi(v2.V2D(), 0, 0);
19653 __ Movi(v3.V2D(), 0, 0);
19654 __ Movi(v4.V2D(), 0, 0);
19655 __ Movi(v5.V2D(), 0, 0);
19656 __ Movi(v6.V2D(), 0, 0);
19657 __ Movi(v7.V2D(), 0, 0);
19658
19659 __ Smmla(v2.V4S(), v0.V16B(), v1.V16B());
19660 __ Smmla(z3.VnS(), z3.VnS(), z0.VnB(), z1.VnB());
19661 __ Ummla(v4.V4S(), v0.V16B(), v1.V16B());
19662 __ Ummla(z5.VnS(), z5.VnS(), z0.VnB(), z1.VnB());
19663 __ Usmmla(v6.V4S(), v0.V16B(), v1.V16B());
19664 __ Usmmla(z7.VnS(), z7.VnS(), z0.VnB(), z1.VnB());
19665 END();
19666
19667 if (CAN_RUN()) {
19668 RUN();
19669
19670 // The inputs as Z registers are zero beyond the least-significant 128 bits,
19671 // so the Neon and SVE results should be equal for any VL.
19672 ASSERT_EQUAL_SVE(z3, z2);
19673 ASSERT_EQUAL_SVE(z5, z4);
19674 ASSERT_EQUAL_SVE(z7, z6);
19675 }
19676}
19677
Martyn Capewell286dce72021-08-20 13:42:06 +010019678TEST_SVE(sudot_usdot) {
19679 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
19680 CPUFeatures::kSVE2,
19681 CPUFeatures::kSVEI8MM);
19682
19683 START();
19684 __ Ptrue(p0.VnB());
19685 __ Index(z0.VnS(), -424242, 77777);
19686 __ Index(z1.VnB(), 127, -1);
19687 __ Sqabs(z1.VnB(), p0.Merging(), z1.VnB());
19688 __ Index(z2.VnB(), 0, 1);
19689 __ Sqabs(z2.VnB(), p0.Merging(), z2.VnB());
19690 __ Index(z3.VnB(), -128, 1);
19691 __ Mov(z4.VnD(), 0);
19692
19693 // Test Usdot against Udot/Sdot over the range of inputs where they should be
19694 // equal.
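  // (After the Sqabs operations above, z1 and z2 hold only values in [0, 127],
  // which have the same encoding as signed and as unsigned bytes.)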
19695 __ Usdot(z5.VnS(), z0.VnS(), z1.VnB(), z2.VnB());
19696 __ Udot(z6.VnS(), z0.VnS(), z1.VnB(), z2.VnB());
19697 __ Usdot(z7.VnS(), z0.VnS(), z1.VnB(), z3.VnB());
19698 __ Sdot(z8.VnS(), z0.VnS(), z1.VnB(), z3.VnB());
19699
19700 // Construct values which, when interpreted correctly as signed/unsigned,
19701 // should give a zero result for dot product.
19702 __ Mov(z10.VnS(), 0x8101ff40); // [-127, 1, -1, 64] as signed bytes.
19703 __ Mov(z11.VnS(), 0x02fe8002); // [2, 254, 128, 2] as unsigned bytes.
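  // In each S lane: 2 * 64 + 128 * (-1) + 254 * 1 + 2 * (-127) = 0.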
19704 __ Usdot(z12.VnS(), z4.VnS(), z11.VnB(), z10.VnB());
19705 __ Usdot(z13.VnS(), z4.VnS(), z10.VnB(), z11.VnB());
19706
19707 // Construct a vector with duplicated values across segments. This allows
19708 // testing indexed dot product against the already tested variant.
19709 __ Mov(z14.VnS(), 1);
19710 __ Mul(z15.VnS(), z14.VnS(), z3.VnS(), 1);
19711
19712 __ Usdot(z16.VnS(), z0.VnS(), z3.VnB(), z3.VnB(), 1);
19713 __ Usdot(z17.VnS(), z0.VnS(), z3.VnB(), z15.VnB());
19714 __ Sudot(z18.VnS(), z0.VnS(), z3.VnB(), z3.VnB(), 1);
19715 __ Usdot(z19.VnS(), z0.VnS(), z15.VnB(), z3.VnB());
19716 END();
19717
19718 if (CAN_RUN()) {
19719 RUN();
19720 ASSERT_EQUAL_SVE(z6, z5);
19721 ASSERT_EQUAL_SVE(z8, z7);
19722 ASSERT_EQUAL_SVE(z4, z12);
19723
19724 uint64_t z13_expected[] = {0xffff8200ffff8200, 0xffff8200ffff8200};
19725 ASSERT_EQUAL_SVE(z13_expected, z13.VnD());
19726
19727 ASSERT_EQUAL_SVE(z17, z16);
19728 ASSERT_EQUAL_SVE(z19, z18);
19729 }
19730}
19731
mmc28a2cdba9e2024-05-16 15:27:20 +010019732TEST_SVE(neon_ins_zero_high_regression_test) {
19733 SVE_SETUP_WITH_FEATURES(CPUFeatures::kNEON, CPUFeatures::kSVE);
19734
19735 START();
19736 __ Movi(v0.V2D(), 0x0f0e0d0c0b0a0908, 0x0706050403020100);
19737
19738 // Check that both forms of ins zero bits <VL-1:128>
19739 __ Index(z1.VnB(), 0, 1);
19740 __ Ins(v1.V16B(), 0, wzr);
19741 __ Index(z2.VnB(), 0, 1);
19742 __ Ins(v2.V16B(), 3, v2.V16B(), 3);
19743 END();
19744
19745 if (CAN_RUN()) {
19746 RUN();
19747 ASSERT_EQUAL_SVE(z0, z1);
19748 ASSERT_EQUAL_SVE(z0, z2);
19749 }
19750}
19751
mmc28a5e267962024-05-28 15:54:13 +010019752TEST_SVE(neon_fcvt_zero_high_regression_test) {
19753 SVE_SETUP_WITH_FEATURES(CPUFeatures::kFP,
19754 CPUFeatures::kNEON,
19755 CPUFeatures::kSVE);
19756
19757 START();
19758 __ Mov(z1.VnD(), 0);
19759 __ Mov(z2.VnD(), 0);
19760 __ Mov(z3.VnD(), 0);
19761 __ Mov(z4.VnD(), 0);
19762 __ Mov(z5.VnD(), 0);
19763 __ Mov(z6.VnD(), 0);
19764 __ Mov(z10.VnD(), 0);
19765
19766 Label done;
19767 // Skip calculations for VL128.
19768 __ Rdvl(x0, 1);
19769 __ Cmp(x0, 16);
19770 __ B(eq, &done);
19771
19772 __ Movi(v0.V2D(), 0x3ff000003f800000);
19773 __ Index(z1.VnB(), 0, 1);
19774 __ Index(z2.VnB(), 0, 1);
19775 __ Index(z3.VnB(), 0, 1);
19776 __ Index(z4.VnB(), 0, 1);
19777 __ Index(z5.VnB(), 0, 1);
19778 __ Index(z6.VnB(), 0, 1);
19779
19780 // Test zeroing bits <VL-1:128> for fcvtl, fcvtn and fcvtxn.
19781 __ Fcvtl(v1.V2D(), v0.V2S());
19782 __ Fcvtl2(v2.V2D(), v0.V4S());
19783
19784 __ Fcvtn(v3.V2S(), v0.V2D());
19785 __ Fcvtn2(v4.V4S(), v0.V2D());
19786
19787 __ Fcvtxn(v5.V2S(), v0.V2D());
19788 __ Fcvtxn2(v6.V4S(), v0.V2D());
19789
19790 // Shift out the bits that are legitimately non-zero, leaving only the bits
 // that should have been zeroed.
19791 __ Ext(z1.VnB(), z1.VnB(), z10.VnB(), kDRegSizeInBytes * 2);
19792 __ Ext(z2.VnB(), z2.VnB(), z10.VnB(), kDRegSizeInBytes * 2);
19793 __ Ext(z3.VnB(), z3.VnB(), z10.VnB(), kSRegSizeInBytes * 2);
19794 __ Ext(z4.VnB(), z4.VnB(), z10.VnB(), kSRegSizeInBytes * 4);
19795 __ Ext(z5.VnB(), z5.VnB(), z10.VnB(), kSRegSizeInBytes * 2);
19796 __ Ext(z6.VnB(), z6.VnB(), z10.VnB(), kSRegSizeInBytes * 4);
19797
19798 __ Bind(&done);
19799 END();
19800
19801 if (CAN_RUN()) {
19802 RUN();
19803 ASSERT_EQUAL_SVE(z10, z1);
19804 ASSERT_EQUAL_SVE(z10, z2);
19805 ASSERT_EQUAL_SVE(z10, z3);
19806 ASSERT_EQUAL_SVE(z10, z4);
19807 ASSERT_EQUAL_SVE(z10, z5);
19808 ASSERT_EQUAL_SVE(z10, z6);
19809 }
19810}
19811
19812#define TEST_ZEROING(INST) \
19813 __ Index(z0.VnB(), 0, 1); \
19814 __ INST; \
19815 __ Orr(z10.VnB(), z10.VnB(), z0.VnB());
19816
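// Check that Neon instructions zero the bits of the destination above bit 127.
// Each TEST_ZEROING step fills z0 across the full vector length, executes the
// instruction, and ORs the result into z10 so that any stray high bits accumulate.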
19817TEST_SVE(neon_zero_high) {
19818 SVE_SETUP_WITH_FEATURES(CPUFeatures::kFP,
19819 CPUFeatures::kNEON,
19820 CPUFeatures::kNEONHalf,
19821 CPUFeatures::kSVE,
19822 CPUFeatures::kFcma,
19823 CPUFeatures::kFHM,
19824 CPUFeatures::kFrintToFixedSizedInt,
19825 CPUFeatures::kDotProduct,
19826 CPUFeatures::kRDM,
19827 CPUFeatures::kI8MM);
19828
19829 START();
19830 __ Mov(z10.VnD(), 0); // Initialise cumulative result register.
19831
19832 TEST_ZEROING(Abs(v0.V16B(), v0.V16B()));
19833 TEST_ZEROING(Abs(v0.V2S(), v0.V2S()));
19834 TEST_ZEROING(Add(v0.V16B(), v0.V16B(), v0.V16B()));
19835 TEST_ZEROING(Add(v0.V2S(), v0.V2S(), v0.V2S()));
19836 TEST_ZEROING(Addhn2(v0.V16B(), v0.V8H(), v0.V8H()));
19837 TEST_ZEROING(Addhn(v0.V4H(), v0.V4S(), v0.V4S()));
19838 TEST_ZEROING(Addp(v0.V16B(), v0.V16B(), v0.V16B()));
19839 TEST_ZEROING(Addp(v0.V2S(), v0.V2S(), v0.V2S()));
19840 TEST_ZEROING(And(v0.V16B(), v0.V16B(), v0.V16B()));
19841 TEST_ZEROING(Bic(v0.V8H(), 0, 0));
19842 TEST_ZEROING(Bic(v0.V2S(), 255, 0));
19843 TEST_ZEROING(Bic(v0.V16B(), v0.V16B(), v0.V16B()));
19844 TEST_ZEROING(Bif(v0.V16B(), v0.V16B(), v0.V16B()));
19845 TEST_ZEROING(Bit(v0.V16B(), v0.V16B(), v0.V16B()));
19846 TEST_ZEROING(Bsl(v0.V16B(), v0.V16B(), v0.V16B()));
19847 TEST_ZEROING(Cls(v0.V16B(), v0.V16B()));
19848 TEST_ZEROING(Cls(v0.V2S(), v0.V2S()));
19849 TEST_ZEROING(Clz(v0.V16B(), v0.V16B()));
19850 TEST_ZEROING(Clz(v0.V2S(), v0.V2S()));
19851 TEST_ZEROING(Cmeq(v0.V16B(), v0.V16B(), 0));
19852 TEST_ZEROING(Cmeq(v0.V2S(), v0.V2S(), 0));
19853 TEST_ZEROING(Cmeq(v0.V16B(), v0.V16B(), v0.V16B()));
19854 TEST_ZEROING(Cmeq(v0.V2S(), v0.V2S(), v0.V2S()));
19855 TEST_ZEROING(Cmge(v0.V16B(), v0.V16B(), 0));
19856 TEST_ZEROING(Cmge(v0.V2S(), v0.V2S(), 0));
19857 TEST_ZEROING(Cmge(v0.V16B(), v0.V16B(), v0.V16B()));
19858 TEST_ZEROING(Cmge(v0.V2S(), v0.V2S(), v0.V2S()));
19859 TEST_ZEROING(Cmgt(v0.V16B(), v0.V16B(), 0));
19860 TEST_ZEROING(Cmgt(v0.V2S(), v0.V2S(), 0));
19861 TEST_ZEROING(Cmgt(v0.V16B(), v0.V16B(), v0.V16B()));
19862 TEST_ZEROING(Cmgt(v0.V2S(), v0.V2S(), v0.V2S()));
19863 TEST_ZEROING(Cmhi(v0.V16B(), v0.V16B(), v0.V16B()));
19864 TEST_ZEROING(Cmhi(v0.V2S(), v0.V2S(), v0.V2S()));
19865 TEST_ZEROING(Cmhs(v0.V16B(), v0.V16B(), v0.V16B()));
19866 TEST_ZEROING(Cmhs(v0.V2S(), v0.V2S(), v0.V2S()));
19867 TEST_ZEROING(Cmle(v0.V16B(), v0.V16B(), 0));
19868 TEST_ZEROING(Cmle(v0.V2S(), v0.V2S(), 0));
19869 TEST_ZEROING(Cmlt(v0.V16B(), v0.V16B(), 0));
19870 TEST_ZEROING(Cmlt(v0.V2S(), v0.V2S(), 0));
19871 TEST_ZEROING(Cmtst(v0.V16B(), v0.V16B(), v0.V16B()));
19872 TEST_ZEROING(Cmtst(v0.V2S(), v0.V2S(), v0.V2S()));
19873 TEST_ZEROING(Cnt(v0.V16B(), v0.V16B()));
19874 TEST_ZEROING(Dup(v0.V2S(), w0));
19875 TEST_ZEROING(Dup(v0.V8B(), w0));
19876 TEST_ZEROING(Dup(v0.V2S(), v0.S(), 0));
19877 TEST_ZEROING(Dup(v0.V8B(), v0.B(), 0));
19878 TEST_ZEROING(Eor(v0.V16B(), v0.V16B(), v0.V16B()));
19879 TEST_ZEROING(Ext(v0.V16B(), v0.V16B(), v0.V16B(), 0));
19880 TEST_ZEROING(Ext(v0.V8B(), v0.V8B(), v0.V8B(), 4));
19881 TEST_ZEROING(Fabd(v0.V4S(), v0.V4S(), v0.V4S()));
19882 TEST_ZEROING(Fabd(v0.V8H(), v0.V8H(), v0.V8H()));
19883 TEST_ZEROING(Fabs(v0.V4S(), v0.V4S()));
19884 TEST_ZEROING(Fabs(v0.V8H(), v0.V8H()));
19885 TEST_ZEROING(Facge(v0.V2S(), v0.V2S(), v0.V2S()));
19886 TEST_ZEROING(Facge(v0.V8H(), v0.V8H(), v0.V8H()));
19887 TEST_ZEROING(Facgt(v0.V2S(), v0.V2S(), v0.V2S()));
19888 TEST_ZEROING(Facgt(v0.V8H(), v0.V8H(), v0.V8H()));
19889 TEST_ZEROING(Fadd(v0.V2S(), v0.V2S(), v0.V2S()));
19890 TEST_ZEROING(Fadd(v0.V8H(), v0.V8H(), v0.V8H()));
19891 TEST_ZEROING(Faddp(v0.V2S(), v0.V2S(), v0.V2S()));
19892 TEST_ZEROING(Faddp(v0.V8H(), v0.V8H(), v0.V8H()));
19893 TEST_ZEROING(Fcadd(v0.V2S(), v0.V2S(), v0.V2S(), 90));
19894 TEST_ZEROING(Fcadd(v0.V8H(), v0.V8H(), v0.V8H(), 90));
19895 TEST_ZEROING(Fcmeq(v0.V2S(), v0.V2S(), 0));
19896 TEST_ZEROING(Fcmeq(v0.V8H(), v0.V8H(), 0));
19897 TEST_ZEROING(Fcmeq(v0.V2S(), v0.V2S(), v0.V2S()));
19898 TEST_ZEROING(Fcmeq(v0.V8H(), v0.V8H(), v0.V8H()));
19899 TEST_ZEROING(Fcmge(v0.V2S(), v0.V2S(), 0));
19900 TEST_ZEROING(Fcmge(v0.V8H(), v0.V8H(), 0));
19901 TEST_ZEROING(Fcmge(v0.V2S(), v0.V2S(), v0.V2S()));
19902 TEST_ZEROING(Fcmge(v0.V8H(), v0.V8H(), v0.V8H()));
19903 TEST_ZEROING(Fcmgt(v0.V2S(), v0.V2S(), 0));
19904 TEST_ZEROING(Fcmgt(v0.V8H(), v0.V8H(), 0));
19905 TEST_ZEROING(Fcmgt(v0.V2S(), v0.V2S(), v0.V2S()));
19906 TEST_ZEROING(Fcmgt(v0.V8H(), v0.V8H(), v0.V8H()));
19907 TEST_ZEROING(Fcmla(v0.V4H(), v0.V4H(), v0.H(), 0, 0));
19908 TEST_ZEROING(Fcmla(v0.V4S(), v0.V4S(), v0.S(), 0, 0));
19909 TEST_ZEROING(Fcmla(v0.V4S(), v0.V4S(), v0.V4S(), 0));
19910 TEST_ZEROING(Fcmla(v0.V4H(), v0.V4H(), v0.V4H(), 0));
19911 TEST_ZEROING(Fcmle(v0.V2S(), v0.V2S(), 0));
19912 TEST_ZEROING(Fcmle(v0.V8H(), v0.V8H(), 0));
19913 TEST_ZEROING(Fcmlt(v0.V2S(), v0.V2S(), 0));
19914 TEST_ZEROING(Fcmlt(v0.V8H(), v0.V8H(), 0));
19915 TEST_ZEROING(Fcvtas(v0.V2S(), v0.V2S()));
19916 TEST_ZEROING(Fcvtas(v0.V8H(), v0.V8H()));
19917 TEST_ZEROING(Fcvtau(v0.V2S(), v0.V2S()));
19918 TEST_ZEROING(Fcvtau(v0.V8H(), v0.V8H()));
19919 TEST_ZEROING(Fcvtl2(v0.V4S(), v0.V8H()));
19920 TEST_ZEROING(Fcvtl(v0.V2D(), v0.V2S()));
19921 TEST_ZEROING(Fcvtms(v0.V2S(), v0.V2S()));
19922 TEST_ZEROING(Fcvtms(v0.V8H(), v0.V8H()));
19923 TEST_ZEROING(Fcvtmu(v0.V2S(), v0.V2S()));
19924 TEST_ZEROING(Fcvtmu(v0.V8H(), v0.V8H()));
19925 TEST_ZEROING(Fcvtn2(v0.V8H(), v0.V4S()));
19926 TEST_ZEROING(Fcvtn(v0.V2S(), v0.V2D()));
19927 TEST_ZEROING(Fcvtns(v0.V2S(), v0.V2S()));
19928 TEST_ZEROING(Fcvtns(v0.V8H(), v0.V8H()));
19929 TEST_ZEROING(Fcvtnu(v0.V2S(), v0.V2S()));
19930 TEST_ZEROING(Fcvtnu(v0.V8H(), v0.V8H()));
19931 TEST_ZEROING(Fcvtps(v0.V2S(), v0.V2S()));
19932 TEST_ZEROING(Fcvtps(v0.V8H(), v0.V8H()));
19933 TEST_ZEROING(Fcvtpu(v0.V2S(), v0.V2S()));
19934 TEST_ZEROING(Fcvtpu(v0.V8H(), v0.V8H()));
19935 TEST_ZEROING(Fcvtxn(v0.V2S(), v0.V2D()));
19936 TEST_ZEROING(Fcvtxn2(v0.V4S(), v0.V2D()));
19937 TEST_ZEROING(Fcvtzs(v0.V2S(), v0.V2S()));
19938 TEST_ZEROING(Fcvtzs(v0.V8H(), v0.V8H()));
19939 TEST_ZEROING(Fcvtzs(v0.V2D(), v0.V2D(), 8));
19940 TEST_ZEROING(Fcvtzu(v0.V2S(), v0.V2S()));
19941 TEST_ZEROING(Fcvtzu(v0.V4H(), v0.V4H()));
19942 TEST_ZEROING(Fcvtzu(v0.V2D(), v0.V2D(), 8));
19943 TEST_ZEROING(Fdiv(v0.V2S(), v0.V2S(), v0.V2S()));
19944 TEST_ZEROING(Fdiv(v0.V8H(), v0.V8H(), v0.V8H()));
19945 TEST_ZEROING(Fmax(v0.V2S(), v0.V2S(), v0.V2S()));
19946 TEST_ZEROING(Fmax(v0.V8H(), v0.V8H(), v0.V8H()));
19947 TEST_ZEROING(Fmaxnm(v0.V2S(), v0.V2S(), v0.V2S()));
19948 TEST_ZEROING(Fmaxnm(v0.V8H(), v0.V8H(), v0.V8H()));
19949 TEST_ZEROING(Fmaxnmp(v0.V2S(), v0.V2S(), v0.V2S()));
19950 TEST_ZEROING(Fmaxnmp(v0.V8H(), v0.V8H(), v0.V8H()));
19951 TEST_ZEROING(Fmaxp(v0.V2S(), v0.V2S(), v0.V2S()));
19952 TEST_ZEROING(Fmaxp(v0.V8H(), v0.V8H(), v0.V8H()));
19953 TEST_ZEROING(Fmin(v0.V2S(), v0.V2S(), v0.V2S()));
19954 TEST_ZEROING(Fmin(v0.V8H(), v0.V8H(), v0.V8H()));
19955 TEST_ZEROING(Fminnm(v0.V2S(), v0.V2S(), v0.V2S()));
19956 TEST_ZEROING(Fminnm(v0.V8H(), v0.V8H(), v0.V8H()));
19957 TEST_ZEROING(Fminnmp(v0.V2S(), v0.V2S(), v0.V2S()));
19958 TEST_ZEROING(Fminnmp(v0.V8H(), v0.V8H(), v0.V8H()));
19959 TEST_ZEROING(Fminp(v0.V2S(), v0.V2S(), v0.V2S()));
19960 TEST_ZEROING(Fminp(v0.V8H(), v0.V8H(), v0.V8H()));
19961 TEST_ZEROING(Fmla(v0.V4S(), v0.V4S(), v0.S(), 0));
19962 TEST_ZEROING(Fmla(v0.V4H(), v0.V4H(), v0.H(), 2));
19963 TEST_ZEROING(Fmla(v0.V4S(), v0.V4S(), v0.V4S()));
19964 TEST_ZEROING(Fmla(v0.V4H(), v0.V4H(), v0.V4H()));
19965 TEST_ZEROING(Fmlal2(v0.V4S(), v0.V4H(), v0.H(), 0));
19966 TEST_ZEROING(Fmlal2(v0.V2S(), v0.V2H(), v0.H(), 2));
19967 TEST_ZEROING(Fmlal2(v0.V4S(), v0.V4H(), v0.V4H()));
19968 TEST_ZEROING(Fmlal(v0.V4S(), v0.V4H(), v0.H(), 0));
19969 TEST_ZEROING(Fmlal(v0.V2S(), v0.V2H(), v0.H(), 2));
19970 TEST_ZEROING(Fmlal(v0.V4S(), v0.V4H(), v0.V4H()));
19971 TEST_ZEROING(Fmls(v0.V4S(), v0.V4S(), v0.S(), 0));
19972 TEST_ZEROING(Fmls(v0.V4H(), v0.V4H(), v0.H(), 2));
19973 TEST_ZEROING(Fmls(v0.V4S(), v0.V4S(), v0.V4S()));
19974 TEST_ZEROING(Fmls(v0.V4H(), v0.V4H(), v0.V4H()));
19975 TEST_ZEROING(Fmlsl2(v0.V4S(), v0.V4H(), v0.H(), 0));
19976 TEST_ZEROING(Fmlsl2(v0.V2S(), v0.V2H(), v0.H(), 2));
19977 TEST_ZEROING(Fmlsl2(v0.V4S(), v0.V4H(), v0.V4H()));
19978 TEST_ZEROING(Fmlsl(v0.V4S(), v0.V4H(), v0.H(), 0));
19979 TEST_ZEROING(Fmlsl(v0.V2S(), v0.V2H(), v0.H(), 2));
19980 TEST_ZEROING(Fmlsl(v0.V4S(), v0.V4H(), v0.V4H()));
19981 TEST_ZEROING(Fmov(v0.V2D(), 2.0000));
19982 TEST_ZEROING(Fmov(v0.V4H(), 2.0000));
19983 TEST_ZEROING(Fmov(v0.D(), 1, x1));
19984 TEST_ZEROING(Fmul(v0.V4S(), v0.V4S(), v0.S(), 0));
19985 TEST_ZEROING(Fmul(v0.V4H(), v0.V4H(), v0.H(), 2));
19986 TEST_ZEROING(Fmul(v0.V4S(), v0.V4S(), v0.V4S()));
19987 TEST_ZEROING(Fmul(v0.V4H(), v0.V4H(), v0.V4H()));
19988 TEST_ZEROING(Fmulx(v0.V4S(), v0.V4S(), v0.S(), 0));
19989 TEST_ZEROING(Fmulx(v0.V4H(), v0.V4H(), v0.H(), 2));
19990 TEST_ZEROING(Fmulx(v0.V4S(), v0.V4S(), v0.V4S()));
19991 TEST_ZEROING(Fmulx(v0.V4H(), v0.V4H(), v0.V4H()));
19992 TEST_ZEROING(Fneg(v0.V4S(), v0.V4S()));
19993 TEST_ZEROING(Fneg(v0.V4H(), v0.V4H()));
19994 TEST_ZEROING(Frecpe(v0.V4S(), v0.V4S()));
19995 TEST_ZEROING(Frecpe(v0.V4H(), v0.V4H()));
19996 TEST_ZEROING(Frecps(v0.V4S(), v0.V4S(), v0.V4S()));
19997 TEST_ZEROING(Frecps(v0.V4H(), v0.V4H(), v0.V4H()));
19998 TEST_ZEROING(Frint32x(v0.V4S(), v0.V4S()));
19999 TEST_ZEROING(Frint32z(v0.V4S(), v0.V4S()));
20000 TEST_ZEROING(Frint64x(v0.V4S(), v0.V4S()));
20001 TEST_ZEROING(Frint64z(v0.V4S(), v0.V4S()));
20002 TEST_ZEROING(Frinta(v0.V4S(), v0.V4S()));
20003 TEST_ZEROING(Frinta(v0.V4H(), v0.V4H()));
20004 TEST_ZEROING(Frinti(v0.V4S(), v0.V4S()));
20005 TEST_ZEROING(Frinti(v0.V4H(), v0.V4H()));
20006 TEST_ZEROING(Frintm(v0.V4S(), v0.V4S()));
20007 TEST_ZEROING(Frintm(v0.V4H(), v0.V4H()));
20008 TEST_ZEROING(Frintn(v0.V4S(), v0.V4S()));
20009 TEST_ZEROING(Frintn(v0.V4H(), v0.V4H()));
20010 TEST_ZEROING(Frintp(v0.V4S(), v0.V4S()));
20011 TEST_ZEROING(Frintp(v0.V4H(), v0.V4H()));
20012 TEST_ZEROING(Frintx(v0.V4S(), v0.V4S()));
20013 TEST_ZEROING(Frintx(v0.V4H(), v0.V4H()));
20014 TEST_ZEROING(Frintz(v0.V4S(), v0.V4S()));
20015 TEST_ZEROING(Frintz(v0.V4H(), v0.V4H()));
20016 TEST_ZEROING(Frsqrte(v0.V4S(), v0.V4S()));
20017 TEST_ZEROING(Frsqrte(v0.V4H(), v0.V4H()));
20018 TEST_ZEROING(Frsqrts(v0.V4S(), v0.V4S(), v0.V4S()));
20019 TEST_ZEROING(Frsqrts(v0.V4H(), v0.V4H(), v0.V4H()));
20020 TEST_ZEROING(Fsqrt(v0.V4S(), v0.V4S()));
20021 TEST_ZEROING(Fsqrt(v0.V4H(), v0.V4H()));
20022 TEST_ZEROING(Fsub(v0.V4S(), v0.V4S(), v0.V4S()));
20023 TEST_ZEROING(Fsub(v0.V4H(), v0.V4H(), v0.V4H()));
20024 TEST_ZEROING(Mov(v0.D(), 0, x0));
20025 TEST_ZEROING(Mov(v0.S(), 0, w0));
20026 TEST_ZEROING(Mov(v0.H(), 0, w0));
20027 TEST_ZEROING(Mov(v0.B(), 0, w0));
20028 TEST_ZEROING(Mov(v0.D(), 0, v0.D(), 0));
20029 TEST_ZEROING(Mov(v0.S(), 0, v0.S(), 0));
20030 TEST_ZEROING(Mov(v0.H(), 0, v0.H(), 0));
20031 TEST_ZEROING(Mov(v0.B(), 0, v0.B(), 0));
20032 TEST_ZEROING(Mla(v0.V4S(), v0.V4S(), v0.S(), 0));
20033 TEST_ZEROING(Mla(v0.V4H(), v0.V4H(), v0.H(), 0));
20034 TEST_ZEROING(Mla(v0.V4S(), v0.V4S(), v0.V4S()));
20035 TEST_ZEROING(Mla(v0.V4H(), v0.V4H(), v0.V4H()));
20036 TEST_ZEROING(Mls(v0.V4S(), v0.V4S(), v0.S(), 0));
20037 TEST_ZEROING(Mls(v0.V4H(), v0.V4H(), v0.H(), 0));
20038 TEST_ZEROING(Mls(v0.V4S(), v0.V4S(), v0.V4S()));
20039 TEST_ZEROING(Mls(v0.V4H(), v0.V4H(), v0.V4H()));
20040 TEST_ZEROING(Movi(v0.V2D(), 0xff));
20041 TEST_ZEROING(Movi(v0.V2S(), 0xff));
20042 TEST_ZEROING(Movi(v0.V4S(), 0x10, LSL, 8));
20043 TEST_ZEROING(Movi(v0.V2S(), 0x10, LSL, 8));
20044 TEST_ZEROING(Mul(v0.V4S(), v0.V4S(), v0.S(), 0));
20045 TEST_ZEROING(Mul(v0.V4H(), v0.V4H(), v0.H(), 0));
20046 TEST_ZEROING(Mul(v0.V4S(), v0.V4S(), v0.V4S()));
20047 TEST_ZEROING(Mul(v0.V4H(), v0.V4H(), v0.V4H()));
20048 TEST_ZEROING(Mvni(v0.V4H(), 0x10, LSL, 8));
20049 TEST_ZEROING(Mvni(v0.V4H(), 0x10, LSL, 8));
20050 TEST_ZEROING(Neg(v0.V4S(), v0.V4S()));
20051 TEST_ZEROING(Neg(v0.V4H(), v0.V4H()));
20052 TEST_ZEROING(Mvn(v0.V16B(), v0.V16B()));
20053 TEST_ZEROING(Mvn(v0.V8B(), v0.V8B()));
20054 TEST_ZEROING(Orn(v0.V8B(), v0.V8B(), v0.V8B()));
20055 TEST_ZEROING(Orn(v0.V16B(), v0.V16B(), v0.V16B()));
20056 TEST_ZEROING(Orr(v0.V8H(), 0x10, 8));
20057 TEST_ZEROING(Orr(v0.V4H(), 0x10, 8));
20058 TEST_ZEROING(Mov(v0.V8B(), v0.V8B()));
20059 TEST_ZEROING(Mov(v0.V16B(), v0.V16B()));
20060 TEST_ZEROING(Pmul(v0.V16B(), v0.V16B(), v0.V16B()));
20061 TEST_ZEROING(Pmull(v0.V8H(), v0.V8B(), v0.V8B()));
20062 TEST_ZEROING(Pmull2(v0.V8H(), v0.V16B(), v0.V16B()));
20063 TEST_ZEROING(Raddhn2(v0.V16B(), v0.V8H(), v0.V8H()));
20064 TEST_ZEROING(Raddhn(v0.V4H(), v0.V4S(), v0.V4S()));
20065 TEST_ZEROING(Rbit(v0.V8B(), v0.V8B()));
20066 TEST_ZEROING(Rbit(v0.V16B(), v0.V16B()));
20067 TEST_ZEROING(Rsubhn2(v0.V16B(), v0.V8H(), v0.V8H()));
20068 TEST_ZEROING(Rsubhn(v0.V4H(), v0.V4S(), v0.V4S()));
20069 TEST_ZEROING(Saba(v0.V16B(), v0.V16B(), v0.V16B()));
20070 TEST_ZEROING(Saba(v0.V2S(), v0.V2S(), v0.V2S()));
20071 TEST_ZEROING(Saba(v0.V4H(), v0.V4H(), v0.V4H()));
20072 TEST_ZEROING(Sabal2(v0.V8H(), v0.V16B(), v0.V16B()));
20073 TEST_ZEROING(Sabal(v0.V4S(), v0.V4H(), v0.V4H()));
20074 TEST_ZEROING(Sabd(v0.V16B(), v0.V16B(), v0.V16B()));
20075 TEST_ZEROING(Sabd(v0.V2S(), v0.V2S(), v0.V2S()));
20076 TEST_ZEROING(Sabd(v0.V4H(), v0.V4H(), v0.V4H()));
20077 TEST_ZEROING(Sabdl2(v0.V8H(), v0.V16B(), v0.V16B()));
20078 TEST_ZEROING(Sabdl(v0.V4S(), v0.V4H(), v0.V4H()));
20079 TEST_ZEROING(Sadalp(v0.V8H(), v0.V16B()));
20080 TEST_ZEROING(Saddl2(v0.V8H(), v0.V16B(), v0.V16B()));
20081 TEST_ZEROING(Saddl(v0.V2D(), v0.V2S(), v0.V2S()));
20082 TEST_ZEROING(Saddl(v0.V4S(), v0.V4H(), v0.V4H()));
20083 TEST_ZEROING(Saddw2(v0.V8H(), v0.V8H(), v0.V16B()));
20084 TEST_ZEROING(Saddw(v0.V4S(), v0.V4S(), v0.V4H()));
20085 TEST_ZEROING(Scvtf(v0.V4S(), v0.V4S()));
20086 TEST_ZEROING(Scvtf(v0.V8H(), v0.V8H()));
20087 TEST_ZEROING(Scvtf(v0.V2D(), v0.V2D(), 8));
20088 TEST_ZEROING(Sdot(v0.V4S(), v0.V16B(), v0.S4B(), 0));
20089 TEST_ZEROING(Sdot(v0.V2S(), v0.V8B(), v0.S4B(), 0));
20090 TEST_ZEROING(Sdot(v0.V4S(), v0.V16B(), v0.V16B()));
20091 TEST_ZEROING(Sdot(v0.V2S(), v0.V8B(), v0.V8B()));
20092 TEST_ZEROING(Shadd(v0.V16B(), v0.V16B(), v0.V16B()));
20093 TEST_ZEROING(Shadd(v0.V4H(), v0.V4H(), v0.V4H()));
20094 TEST_ZEROING(Shl(v0.V2D(), v0.V2D(), 56));
20095 TEST_ZEROING(Shll2(v0.V8H(), v0.V16B(), 8));
20096 TEST_ZEROING(Shll(v0.V2D(), v0.V2S(), 32));
20097 TEST_ZEROING(Shsub(v0.V16B(), v0.V16B(), v0.V16B()));
20098 TEST_ZEROING(Shsub(v0.V4H(), v0.V4H(), v0.V4H()));
20099 TEST_ZEROING(Sli(v0.V2D(), v0.V2D(), 56));
20100 TEST_ZEROING(Sli(v0.V2S(), v0.V2S(), 16));
20101 TEST_ZEROING(Smax(v0.V16B(), v0.V16B(), v0.V16B()));
20102 TEST_ZEROING(Smax(v0.V4H(), v0.V4H(), v0.V4H()));
20103 TEST_ZEROING(Smaxp(v0.V16B(), v0.V16B(), v0.V16B()));
20104 TEST_ZEROING(Smaxp(v0.V4H(), v0.V4H(), v0.V4H()));
20105 TEST_ZEROING(Smin(v0.V16B(), v0.V16B(), v0.V16B()));
20106 TEST_ZEROING(Smin(v0.V4H(), v0.V4H(), v0.V4H()));
20107 TEST_ZEROING(Sminp(v0.V16B(), v0.V16B(), v0.V16B()));
20108 TEST_ZEROING(Sminp(v0.V4H(), v0.V4H(), v0.V4H()));
20109 TEST_ZEROING(Smlal2(v0.V8H(), v0.V16B(), v0.V16B()));
20110 TEST_ZEROING(Smlal(v0.V2D(), v0.V2S(), v0.V2S()));
20111 TEST_ZEROING(Smlal(v0.V2D(), v0.V2S(), v0.S(), 0));
20112 TEST_ZEROING(Smlsl2(v0.V8H(), v0.V16B(), v0.V16B()));
20113 TEST_ZEROING(Smlsl(v0.V2D(), v0.V2S(), v0.V2S()));
20114 TEST_ZEROING(Smlsl(v0.V2D(), v0.V2S(), v0.S(), 0));
20115 TEST_ZEROING(Smull2(v0.V8H(), v0.V16B(), v0.V16B()));
20116 TEST_ZEROING(Smull(v0.V2D(), v0.V2S(), v0.V2S()));
20117 TEST_ZEROING(Smull(v0.V2D(), v0.V2S(), v0.S(), 0));
20118 TEST_ZEROING(Sqabs(v0.V16B(), v0.V16B()));
20119 TEST_ZEROING(Sqabs(v0.V4H(), v0.V4H()));
20120 TEST_ZEROING(Sqadd(v0.V16B(), v0.V16B(), v0.V16B()));
20121 TEST_ZEROING(Sqadd(v0.V4H(), v0.V4H(), v0.V4H()));
20122 TEST_ZEROING(Sqdmlal2(v0.V4S(), v0.V8H(), v0.V8H()));
20123 TEST_ZEROING(Sqdmlal(v0.V2D(), v0.V2S(), v0.V2S()));
20124 TEST_ZEROING(Sqdmlal(v0.V2D(), v0.V2S(), v0.S(), 0));
20125 TEST_ZEROING(Sqdmlsl2(v0.V4S(), v0.V8H(), v0.V8H()));
20126 TEST_ZEROING(Sqdmlsl(v0.V2D(), v0.V2S(), v0.V2S()));
20127 TEST_ZEROING(Sqdmlsl(v0.V2D(), v0.V2S(), v0.S(), 0));
20128 TEST_ZEROING(Sqdmulh(v0.V4S(), v0.V4S(), v0.S(), 0));
20129 TEST_ZEROING(Sqdmulh(v0.V4H(), v0.V4H(), v0.H(), 0));
20130 TEST_ZEROING(Sqdmulh(v0.V4S(), v0.V4S(), v0.V4S()));
20131 TEST_ZEROING(Sqdmulh(v0.V4H(), v0.V4H(), v0.V4H()));
20132 TEST_ZEROING(Sqdmull2(v0.V2D(), v0.V4S(), v0.V4S()));
20133 TEST_ZEROING(Sqdmull(v0.V4S(), v0.V4H(), v0.V4H()));
20134 TEST_ZEROING(Sqdmull2(v0.V2D(), v0.V4S(), v0.S(), 0));
20135 TEST_ZEROING(Sqdmull(v0.V4S(), v0.V4H(), v0.H(), 0));
20136 TEST_ZEROING(Sqneg(v0.V16B(), v0.V16B()));
20137 TEST_ZEROING(Sqneg(v0.V2S(), v0.V2S()));
20138 TEST_ZEROING(Sqrdmlah(v0.V4S(), v0.V4S(), v0.S(), 0));
20139 TEST_ZEROING(Sqrdmlah(v0.V4H(), v0.V4H(), v0.H(), 0));
20140 TEST_ZEROING(Sqrdmlah(v0.V4S(), v0.V4S(), v0.V4S()));
20141 TEST_ZEROING(Sqrdmlah(v0.V4H(), v0.V4H(), v0.V4H()));
20142 TEST_ZEROING(Sqrdmlsh(v0.V4S(), v0.V4S(), v0.S(), 0));
20143 TEST_ZEROING(Sqrdmlsh(v0.V4H(), v0.V4H(), v0.H(), 0));
20144 TEST_ZEROING(Sqrdmlsh(v0.V4S(), v0.V4S(), v0.V4S()));
20145 TEST_ZEROING(Sqrdmlsh(v0.V4H(), v0.V4H(), v0.V4H()));
20146 TEST_ZEROING(Sqrdmulh(v0.V4S(), v0.V4S(), v0.S(), 0));
20147 TEST_ZEROING(Sqrdmulh(v0.V4H(), v0.V4H(), v0.H(), 0));
20148 TEST_ZEROING(Sqrdmulh(v0.V4S(), v0.V4S(), v0.V4S()));
20149 TEST_ZEROING(Sqrdmulh(v0.V4H(), v0.V4H(), v0.V4H()));
20150 TEST_ZEROING(Sqrshl(v0.V16B(), v0.V16B(), v0.V16B()));
20151 TEST_ZEROING(Sqrshl(v0.V4H(), v0.V4H(), v0.V4H()));
20152 TEST_ZEROING(Sqshl(v0.V16B(), v0.V16B(), v0.V16B()));
20153 TEST_ZEROING(Sqshl(v0.V4H(), v0.V4H(), v0.V4H()));
20154 TEST_ZEROING(Sqshl(v0.V2D(), v0.V2D(), 56));
20155 TEST_ZEROING(Sqshl(v0.V2S(), v0.V2S(), 16));
20156 TEST_ZEROING(Sqshlu(v0.V2D(), v0.V2D(), 56));
20157 TEST_ZEROING(Sqshlu(v0.V2S(), v0.V2S(), 16));
20158 TEST_ZEROING(Sqsub(v0.V16B(), v0.V16B(), v0.V16B()));
20159 TEST_ZEROING(Sqsub(v0.V4H(), v0.V4H(), v0.V4H()));
20160 TEST_ZEROING(Sqxtn2(v0.V16B(), v0.V8H()));
20161 TEST_ZEROING(Sqxtn(v0.V2S(), v0.V2D()));
20162 TEST_ZEROING(Sqxtun2(v0.V16B(), v0.V8H()));
20163 TEST_ZEROING(Sqxtun(v0.V2S(), v0.V2D()));
20164 TEST_ZEROING(Srhadd(v0.V16B(), v0.V16B(), v0.V16B()));
20165 TEST_ZEROING(Srhadd(v0.V4H(), v0.V4H(), v0.V4H()));
20166 TEST_ZEROING(Sri(v0.V2D(), v0.V2D(), 8));
20167 TEST_ZEROING(Sri(v0.V2S(), v0.V2S(), 8));
20168 TEST_ZEROING(Srshl(v0.V16B(), v0.V16B(), v0.V16B()));
20169 TEST_ZEROING(Srshl(v0.V4H(), v0.V4H(), v0.V4H()));
20170 TEST_ZEROING(Srshr(v0.V2D(), v0.V2D(), 8));
20171 TEST_ZEROING(Srshr(v0.V2S(), v0.V2S(), 8));
20172 TEST_ZEROING(Srsra(v0.V2D(), v0.V2D(), 8));
20173 TEST_ZEROING(Srsra(v0.V2S(), v0.V2S(), 8));
20174 TEST_ZEROING(Sshl(v0.V16B(), v0.V16B(), v0.V16B()));
20175 TEST_ZEROING(Sshl(v0.V4H(), v0.V4H(), v0.V4H()));
20176 TEST_ZEROING(Sshr(v0.V2D(), v0.V2D(), 8));
20177 TEST_ZEROING(Sshr(v0.V2S(), v0.V2S(), 8));
20178 TEST_ZEROING(Ssra(v0.V2D(), v0.V2D(), 8));
20179 TEST_ZEROING(Ssra(v0.V2S(), v0.V2S(), 8));
20180 TEST_ZEROING(Ssubl2(v0.V8H(), v0.V16B(), v0.V16B()));
20181 TEST_ZEROING(Ssubl(v0.V4S(), v0.V4H(), v0.V4H()));
20182 TEST_ZEROING(Ssubw2(v0.V8H(), v0.V8H(), v0.V16B()));
20183 TEST_ZEROING(Ssubw(v0.V4S(), v0.V4S(), v0.V4H()));
20184 TEST_ZEROING(Sub(v0.V16B(), v0.V16B(), v0.V16B()));
20185 TEST_ZEROING(Sub(v0.V4H(), v0.V4H(), v0.V4H()));
20186 TEST_ZEROING(Subhn2(v0.V16B(), v0.V8H(), v0.V8H()));
20187 TEST_ZEROING(Subhn(v0.V4H(), v0.V4S(), v0.V4S()));
20188 TEST_ZEROING(Sudot(v0.V4S(), v0.V16B(), v0.S4B(), 0));
20189 TEST_ZEROING(Sudot(v0.V2S(), v0.V8B(), v0.S4B(), 2));
20190 TEST_ZEROING(Suqadd(v0.V16B(), v0.V16B()));
20191 TEST_ZEROING(Suqadd(v0.V4H(), v0.V4H()));
20192 TEST_ZEROING(Tbl(v0.V8B(), {v0.V16B()}, v0.V8B()));
20193 TEST_ZEROING(Tbl(v0.V16B(), {v0.V16B()}, v0.V16B()));
20194 TEST_ZEROING(Tbx(v0.V8B(), {v0.V16B()}, v0.V8B()));
20195 TEST_ZEROING(Tbx(v0.V16B(), {v0.V16B()}, v0.V16B()));
20196 TEST_ZEROING(Trn1(v0.V16B(), v0.V16B(), v0.V16B()));
20197 TEST_ZEROING(Trn1(v0.V4H(), v0.V4H(), v0.V4H()));
20198 TEST_ZEROING(Trn2(v0.V16B(), v0.V16B(), v0.V16B()));
20199 TEST_ZEROING(Trn2(v0.V4H(), v0.V4H(), v0.V4H()));
20200 TEST_ZEROING(Uaba(v0.V16B(), v0.V16B(), v0.V16B()));
20201 TEST_ZEROING(Uaba(v0.V4H(), v0.V4H(), v0.V4H()));
20202 TEST_ZEROING(Uabal2(v0.V8H(), v0.V16B(), v0.V16B()));
20203 TEST_ZEROING(Uabal(v0.V4S(), v0.V4H(), v0.V4H()));
20204 TEST_ZEROING(Uabd(v0.V16B(), v0.V16B(), v0.V16B()));
20205 TEST_ZEROING(Uabd(v0.V4H(), v0.V4H(), v0.V4H()));
20206 TEST_ZEROING(Uabdl2(v0.V8H(), v0.V16B(), v0.V16B()));
20207 TEST_ZEROING(Uabdl(v0.V4S(), v0.V4H(), v0.V4H()));
20208 TEST_ZEROING(Uadalp(v0.V8H(), v0.V16B()));
20209 TEST_ZEROING(Uadalp(v0.V2S(), v0.V4H()));
20210 TEST_ZEROING(Uaddl2(v0.V8H(), v0.V16B(), v0.V16B()));
20211 TEST_ZEROING(Uaddl(v0.V4S(), v0.V4H(), v0.V4H()));
20212 TEST_ZEROING(Uaddlp(v0.V8H(), v0.V16B()));
20213 TEST_ZEROING(Uaddlp(v0.V2S(), v0.V4H()));
20214 TEST_ZEROING(Uaddw2(v0.V8H(), v0.V8H(), v0.V16B()));
20215 TEST_ZEROING(Uaddw(v0.V4S(), v0.V4S(), v0.V4H()));
20216 TEST_ZEROING(Ucvtf(v0.V4S(), v0.V4S()));
20217 TEST_ZEROING(Ucvtf(v0.V4H(), v0.V4H()));
20218 TEST_ZEROING(Ucvtf(v0.V2D(), v0.V2D(), 8));
20219 TEST_ZEROING(Ucvtf(v0.V2S(), v0.V2S(), 8));
20220 TEST_ZEROING(Udot(v0.V4S(), v0.V16B(), v0.S4B(), 0));
20221 TEST_ZEROING(Udot(v0.V2S(), v0.V8B(), v0.S4B(), 0));
20222 TEST_ZEROING(Udot(v0.V2S(), v0.V8B(), v0.V8B()));
20223 TEST_ZEROING(Udot(v0.V4S(), v0.V16B(), v0.V16B()));
20224 TEST_ZEROING(Uhadd(v0.V16B(), v0.V16B(), v0.V16B()));
20225 TEST_ZEROING(Uhadd(v0.V4H(), v0.V4H(), v0.V4H()));
20226 TEST_ZEROING(Uhsub(v0.V16B(), v0.V16B(), v0.V16B()));
20227 TEST_ZEROING(Uhsub(v0.V2S(), v0.V2S(), v0.V2S()));
20228 TEST_ZEROING(Umax(v0.V16B(), v0.V16B(), v0.V16B()));
20229 TEST_ZEROING(Umax(v0.V4H(), v0.V4H(), v0.V4H()));
20230 TEST_ZEROING(Umaxp(v0.V16B(), v0.V16B(), v0.V16B()));
20231 TEST_ZEROING(Umaxp(v0.V4H(), v0.V4H(), v0.V4H()));
20232 TEST_ZEROING(Umin(v0.V16B(), v0.V16B(), v0.V16B()));
20233 TEST_ZEROING(Umin(v0.V4H(), v0.V4H(), v0.V4H()));
20234 TEST_ZEROING(Uminp(v0.V16B(), v0.V16B(), v0.V16B()));
20235 TEST_ZEROING(Uminp(v0.V4H(), v0.V4H(), v0.V4H()));
20236 TEST_ZEROING(Umlal2(v0.V8H(), v0.V16B(), v0.V16B()));
20237 TEST_ZEROING(Umlal(v0.V4S(), v0.V4H(), v0.V4H()));
20238 TEST_ZEROING(Umlal(v0.V2D(), v0.V2S(), v0.S(), 0));
20239 TEST_ZEROING(Umlal(v0.V4S(), v0.V4H(), v0.H(), 0));
20240 TEST_ZEROING(Umlsl2(v0.V8H(), v0.V16B(), v0.V16B()));
20241 TEST_ZEROING(Umlsl(v0.V4S(), v0.V4H(), v0.V4H()));
20242 TEST_ZEROING(Umlsl(v0.V2D(), v0.V2S(), v0.S(), 0));
20243 TEST_ZEROING(Umlsl(v0.V4S(), v0.V4H(), v0.H(), 0));
20244 TEST_ZEROING(Umull2(v0.V8H(), v0.V16B(), v0.V16B()));
20245 TEST_ZEROING(Umull(v0.V4S(), v0.V4H(), v0.V4H()));
20246 TEST_ZEROING(Umull(v0.V2D(), v0.V2S(), v0.S(), 0));
20247 TEST_ZEROING(Umull(v0.V4S(), v0.V4H(), v0.H(), 0));
20248 TEST_ZEROING(Uqadd(v0.V16B(), v0.V16B(), v0.V16B()));
20249 TEST_ZEROING(Uqadd(v0.V4H(), v0.V4H(), v0.V4H()));
20250 TEST_ZEROING(Uqrshl(v0.V16B(), v0.V16B(), v0.V16B()));
20251 TEST_ZEROING(Uqrshl(v0.V4H(), v0.V4H(), v0.V4H()));
20252 TEST_ZEROING(Uqshl(v0.V16B(), v0.V16B(), v0.V16B()));
20253 TEST_ZEROING(Uqshl(v0.V4H(), v0.V4H(), v0.V4H()));
20254 TEST_ZEROING(Uqsub(v0.V16B(), v0.V16B(), v0.V16B()));
20255 TEST_ZEROING(Uqsub(v0.V4H(), v0.V4H(), v0.V4H()));
20256 TEST_ZEROING(Uqxtn2(v0.V16B(), v0.V8H()));
20257 TEST_ZEROING(Uqxtn(v0.V2S(), v0.V2D()));
20258 TEST_ZEROING(Urecpe(v0.V2S(), v0.V2S()));
20259 TEST_ZEROING(Urecpe(v0.V4S(), v0.V4S()));
20260 TEST_ZEROING(Urhadd(v0.V16B(), v0.V16B(), v0.V16B()));
20261 TEST_ZEROING(Urhadd(v0.V4H(), v0.V4H(), v0.V4H()));
20262 TEST_ZEROING(Urshl(v0.V16B(), v0.V16B(), v0.V16B()));
20263 TEST_ZEROING(Urshl(v0.V4H(), v0.V4H(), v0.V4H()));
20264 TEST_ZEROING(Urshr(v0.V2D(), v0.V2D(), 8));
20265 TEST_ZEROING(Urshr(v0.V2S(), v0.V2S(), 8));
20266 TEST_ZEROING(Ursqrte(v0.V4S(), v0.V4S()));
20267 TEST_ZEROING(Ursqrte(v0.V2S(), v0.V2S()));
20268 TEST_ZEROING(Ursra(v0.V2D(), v0.V2D(), 8));
20269 TEST_ZEROING(Ursra(v0.V2S(), v0.V2S(), 8));
20270 TEST_ZEROING(Usdot(v0.V4S(), v0.V16B(), v0.S4B(), 0));
20271 TEST_ZEROING(Usdot(v0.V2S(), v0.V8B(), v0.S4B(), 1));
20272 TEST_ZEROING(Usdot(v0.V4S(), v0.V16B(), v0.V16B()));
20273 TEST_ZEROING(Usdot(v0.V2S(), v0.V8B(), v0.V8B()));
20274 TEST_ZEROING(Ushl(v0.V16B(), v0.V16B(), v0.V16B()));
20275 TEST_ZEROING(Ushl(v0.V4H(), v0.V4H(), v0.V4H()));
20276 TEST_ZEROING(Ushr(v0.V2D(), v0.V2D(), 8));
20277 TEST_ZEROING(Ushr(v0.V2S(), v0.V2S(), 8));
20278 TEST_ZEROING(Usqadd(v0.V16B(), v0.V16B()));
20279 TEST_ZEROING(Usqadd(v0.V4H(), v0.V4H()));
20280 TEST_ZEROING(Usra(v0.V2D(), v0.V2D(), 8));
20281 TEST_ZEROING(Usra(v0.V2S(), v0.V2S(), 8));
20282 TEST_ZEROING(Usubl2(v0.V8H(), v0.V16B(), v0.V16B()));
20283 TEST_ZEROING(Usubl(v0.V4S(), v0.V4H(), v0.V4H()));
20284 TEST_ZEROING(Usubw2(v0.V8H(), v0.V8H(), v0.V16B()));
20285 TEST_ZEROING(Usubw(v0.V4S(), v0.V4S(), v0.V4H()));
20286 TEST_ZEROING(Uzp1(v0.V16B(), v0.V16B(), v0.V16B()));
20287 TEST_ZEROING(Uzp1(v0.V4H(), v0.V4H(), v0.V4H()));
20288 TEST_ZEROING(Uzp2(v0.V16B(), v0.V16B(), v0.V16B()));
20289 TEST_ZEROING(Uzp2(v0.V4H(), v0.V4H(), v0.V4H()));
20290 TEST_ZEROING(Xtn2(v0.V16B(), v0.V8H()));
20291 TEST_ZEROING(Xtn(v0.V4H(), v0.V4S()));
20292 TEST_ZEROING(Zip1(v0.V16B(), v0.V16B(), v0.V16B()));
20293 TEST_ZEROING(Zip1(v0.V4H(), v0.V4H(), v0.V4H()));
20294 TEST_ZEROING(Zip2(v0.V16B(), v0.V16B(), v0.V16B()));
20295 TEST_ZEROING(Zip2(v0.V4H(), v0.V4H(), v0.V4H()));
20296
20297 __ Mov(z11.VnD(), 0);
20298
20299 Label done, zero_127_to_0;
20300 __ Rdvl(x0, 1);
20301 __ Cmp(x0, 16);
20302 __ B(gt, &zero_127_to_0);
20303
20304 // For 128-bit VL, there's nothing to be tested, so zero the whole register.
20305 __ Mov(z10.VnD(), 0);
20306 __ B(&done);
20307
20308 // Set the expected non-zero bits to zero.
20309 __ Bind(&zero_127_to_0);
20310 __ Ext(z10.VnB(), z10.VnB(), z11.VnB(), kDRegSizeInBytes * 2);
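  // Ext extracts from the concatenation of z11:z10 starting at byte 16, so the
  // low 128 bits of z10 (which the NEON instructions legitimately wrote) are
  // discarded and zeroes from z11 are shifted in at the top; only the bits
  // that must be zero remain for the comparison below.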
20311
20312 __ Bind(&done);
20313
20314 END();
20315
20316 if (CAN_RUN()) {
20317 RUN();
20318 ASSERT_EQUAL_SVE(z11, z10);
20319 }
20320}
20321
20322#undef TEST_ZEROING
20323
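// Each TEST_ZEROING_n macro below fills z0..z(n-1) with a byte-index pattern,
// performs the load, then ORs those registers into the cumulative result in
// z10. Any bits above bit 127 that the load fails to clear therefore survive
// into z10, where the final comparison detects them.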
20324#define TEST_ZEROING_1(INST) \
20325 __ Index(z0.VnB(), 0, 1); \
20326 __ INST; \
20327 __ Orr(z10.VnB(), z10.VnB(), z0.VnB());
20328#define TEST_ZEROING_2(INST) \
20329 __ Index(z0.VnB(), 0, 1); \
20330 __ Index(z1.VnB(), 0, 1); \
20331 __ INST; \
20332 __ Orr(z10.VnB(), z10.VnB(), z0.VnB()); \
20333 __ Orr(z10.VnB(), z10.VnB(), z1.VnB());
20334#define TEST_ZEROING_3(INST) \
20335 __ Index(z0.VnB(), 0, 1); \
20336 __ Index(z1.VnB(), 0, 1); \
20337 __ Index(z2.VnB(), 0, 1); \
20338 __ INST; \
20339 __ Orr(z10.VnB(), z10.VnB(), z0.VnB()); \
20340 __ Orr(z10.VnB(), z10.VnB(), z1.VnB()); \
20341 __ Orr(z10.VnB(), z10.VnB(), z2.VnB());
20342#define TEST_ZEROING_4(INST) \
20343 __ Index(z0.VnB(), 0, 1); \
20344 __ Index(z1.VnB(), 0, 1); \
20345 __ Index(z2.VnB(), 0, 1); \
20346 __ Index(z3.VnB(), 0, 1); \
20347 __ INST; \
20348 __ Orr(z10.VnB(), z10.VnB(), z0.VnB()); \
20349 __ Orr(z10.VnB(), z10.VnB(), z1.VnB()); \
20350 __ Orr(z10.VnB(), z10.VnB(), z2.VnB()); \
20351 __ Orr(z10.VnB(), z10.VnB(), z3.VnB());
20352
20353TEST_SVE(neon_load_zero_high) {
20354 SVE_SETUP_WITH_FEATURES(CPUFeatures::kNEON, CPUFeatures::kSVE);
20355
20356 START();
20357 __ Mov(z10.VnD(), 0); // Initialise cumulative result register.
20358
20359 // Initialise x0 to point to a buffer from which data is loaded. The contents
20360 // do not need to be defined.
20361 int data_size = 4 * kQRegSizeInBytes;
20362 uint8_t* data = new uint8_t[data_size];
20363 __ Mov(x0, reinterpret_cast<uintptr_t>(data));
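  // The largest single access below is Ld4 of four 16-byte registers
  // (64 bytes), which is exactly data_size, so basing x0 at the start of the
  // buffer keeps every load within bounds.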
20364
20365 MemOperand mop = MemOperand(x0);
20366 TEST_ZEROING_1(Ld1(v0.V16B(), mop));
20367 TEST_ZEROING_1(Ld1(v0.V4H(), mop));
20368 TEST_ZEROING_1(Ld1(v0.V16B(), v1.V16B(), mop));
20369 TEST_ZEROING_1(Ld1(v0.V4H(), v1.V4H(), mop));
20370 TEST_ZEROING_1(Ld1(v0.V16B(), v1.V16B(), v2.V16B(), mop));
20371 TEST_ZEROING_1(Ld1(v0.V4H(), v1.V4H(), v2.V4H(), mop));
20372 TEST_ZEROING_1(Ld1(v0.V16B(), v1.V16B(), v2.V16B(), v3.V16B(), mop));
20373 TEST_ZEROING_1(Ld1(v0.V4H(), v1.V4H(), v2.V4H(), v3.V4H(), mop));
20374 TEST_ZEROING_1(Ld1(v0.B(), 1, mop));
20375 TEST_ZEROING_1(Ld1(v0.D(), 1, mop));
20376 TEST_ZEROING_1(Ld1(v0.H(), 1, mop));
20377 TEST_ZEROING_1(Ld1(v0.S(), 1, mop));
20378 TEST_ZEROING_1(Ld1r(v0.V16B(), mop));
20379 TEST_ZEROING_1(Ld1r(v0.V4H(), mop));
20380 TEST_ZEROING_2(Ld2(v0.V16B(), v1.V16B(), mop));
20381 TEST_ZEROING_2(Ld2(v0.V4H(), v1.V4H(), mop));
20382 TEST_ZEROING_2(Ld2(v0.B(), v1.B(), 1, mop));
20383 TEST_ZEROING_2(Ld2(v0.D(), v1.D(), 1, mop));
20384 TEST_ZEROING_2(Ld2(v0.H(), v1.H(), 1, mop));
20385 TEST_ZEROING_2(Ld2(v0.S(), v1.S(), 1, mop));
20386 TEST_ZEROING_2(Ld2r(v0.V16B(), v1.V16B(), mop));
20387 TEST_ZEROING_2(Ld2r(v0.V4H(), v1.V4H(), mop));
20388 TEST_ZEROING_3(Ld3(v0.V16B(), v1.V16B(), v2.V16B(), mop));
20389 TEST_ZEROING_3(Ld3(v0.V4H(), v1.V4H(), v2.V4H(), mop));
20390 TEST_ZEROING_3(Ld3(v0.B(), v1.B(), v2.B(), 1, mop));
20391 TEST_ZEROING_3(Ld3(v0.D(), v1.D(), v2.D(), 1, mop));
20392 TEST_ZEROING_3(Ld3(v0.H(), v1.H(), v2.H(), 1, mop));
20393 TEST_ZEROING_3(Ld3(v0.S(), v1.S(), v2.S(), 1, mop));
20394 TEST_ZEROING_3(Ld3r(v0.V16B(), v1.V16B(), v2.V16B(), mop));
20395 TEST_ZEROING_3(Ld3r(v0.V4H(), v1.V4H(), v2.V4H(), mop));
20396 TEST_ZEROING_4(Ld4(v0.V16B(), v1.V16B(), v2.V16B(), v3.V16B(), mop));
20397 TEST_ZEROING_4(Ld4(v0.V4H(), v1.V4H(), v2.V4H(), v3.V4H(), mop));
20398 TEST_ZEROING_4(Ld4(v0.B(), v1.B(), v2.B(), v3.B(), 1, mop));
20399 TEST_ZEROING_4(Ld4(v0.D(), v1.D(), v2.D(), v3.D(), 1, mop));
20400 TEST_ZEROING_4(Ld4(v0.H(), v1.H(), v2.H(), v3.H(), 1, mop));
20401 TEST_ZEROING_4(Ld4(v0.S(), v1.S(), v2.S(), v3.S(), 1, mop));
20402 TEST_ZEROING_4(Ld4r(v0.V16B(), v1.V16B(), v2.V16B(), v3.V16B(), mop));
20403 TEST_ZEROING_4(Ld4r(v0.V4H(), v1.V4H(), v2.V4H(), v3.V4H(), mop));
20404
20405 __ Mov(z11.VnD(), 0);
20406
20407 Label done, zero_127_to_0;
20408 __ Rdvl(x0, 1);
20409 __ Cmp(x0, 16);
20410 __ B(gt, &zero_127_to_0);
20411
20412 // For 128-bit VL, there's nothing to be tested, so zero the whole register.
20413 __ Mov(z10.VnD(), 0);
20414 __ B(&done);
20415
20416 // Set the expected non-zero bits to zero.
20417 __ Bind(&zero_127_to_0);
20418 __ Ext(z10.VnB(), z10.VnB(), z11.VnB(), kDRegSizeInBytes * 2);
20419
20420 __ Bind(&done);
20421
20422 END();
20423
20424 if (CAN_RUN()) {
20425 RUN();
20426 ASSERT_EQUAL_SVE(z11, z10);
20427 }
  delete[] data;
20428}
20429
20430#undef TEST_ZEROING_1
20431#undef TEST_ZEROING_2
20432#undef TEST_ZEROING_3
20433#undef TEST_ZEROING_4
20434
20435TEST_SVE(sve_load_store_sp_base_regression_test) {
20436 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
20437 START();
20438
20439 __ Mov(x0, 0);
20440 __ Mov(z0.VnB(), 0);
20441 __ Ptrue(p0.VnB());
20442
20443 Label loop;
20444 __ Mov(x1, 128);
20445 __ Bind(&loop);
20446 __ Push(xzr, xzr);
20447 __ Sub(x1, x1, 1);
20448 __ Cbnz(x1, &loop);
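  // The loop above pushes 128 pairs of xzr, reserving 2048 bytes of
  // zero-filled stack below sp. Since x0 is zero and z0 only ever reloads
  // zeroes from this region, every [sp]-, [sp, x0]- and [sp, z0.d]-based
  // access below stays within it.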
20449
20450 {
20451 ExactAssemblyScope scope(&masm, 193 * kInstructionSize);
20452
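    // Each raw encoding below has its base register field set to 31, which
    // must be interpreted as sp rather than xzr when used as a base; see the
    // comment after RUN() at the end of this test.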
20453 __ dci(0xa420a3e0); // ld1b {z0.h}, p0/z, [sp]
20454 __ dci(0xa440a3e0); // ld1b {z0.s}, p0/z, [sp]
20455 __ dci(0xa460a3e0); // ld1b {z0.d}, p0/z, [sp]
20456 __ dci(0xa400a3e0); // ld1b {z0.b}, p0/z, [sp]
20457 __ dci(0xa42043e0); // ld1b {z0.h}, p0/z, [sp, x0]
20458 __ dci(0xa44043e0); // ld1b {z0.s}, p0/z, [sp, x0]
20459 __ dci(0xa46043e0); // ld1b {z0.d}, p0/z, [sp, x0]
20460 __ dci(0xa40043e0); // ld1b {z0.b}, p0/z, [sp, x0]
20461 __ dci(0xc440c3e0); // ld1b {z0.d}, p0/z, [sp, z0.d]
20462 __ dci(0xa5e0a3e0); // ld1d {z0.d}, p0/z, [sp]
20463 __ dci(0xa5e043e0); // ld1d {z0.d}, p0/z, [sp, x0, lsl #3]
20464 __ dci(0xc5e0c3e0); // ld1d {z0.d}, p0/z, [sp, z0.d, lsl #3]
20465 __ dci(0xc5c0c3e0); // ld1d {z0.d}, p0/z, [sp, z0.d]
20466 __ dci(0xa4a0a3e0); // ld1h {z0.h}, p0/z, [sp]
20467 __ dci(0xa4c0a3e0); // ld1h {z0.s}, p0/z, [sp]
20468 __ dci(0xa4e0a3e0); // ld1h {z0.d}, p0/z, [sp]
20469 __ dci(0xa4a043e0); // ld1h {z0.h}, p0/z, [sp, x0, lsl #1]
20470 __ dci(0xa4c043e0); // ld1h {z0.s}, p0/z, [sp, x0, lsl #1]
20471 __ dci(0xa4e043e0); // ld1h {z0.d}, p0/z, [sp, x0, lsl #1]
20472 __ dci(0xc4e0c3e0); // ld1h {z0.d}, p0/z, [sp, z0.d, lsl #1]
20473 __ dci(0xc4c0c3e0); // ld1h {z0.d}, p0/z, [sp, z0.d]
20474 __ dci(0x8440a3e0); // ld1rb {z0.h}, p0/z, [sp]
20475 __ dci(0x8440c3e0); // ld1rb {z0.s}, p0/z, [sp]
20476 __ dci(0x8440e3e0); // ld1rb {z0.d}, p0/z, [sp]
20477 __ dci(0x844083e0); // ld1rb {z0.b}, p0/z, [sp]
20478 __ dci(0x85c0e3e0); // ld1rd {z0.d}, p0/z, [sp]
20479 __ dci(0x84c0a3e0); // ld1rh {z0.h}, p0/z, [sp]
20480 __ dci(0x84c0c3e0); // ld1rh {z0.s}, p0/z, [sp]
20481 __ dci(0x84c0e3e0); // ld1rh {z0.d}, p0/z, [sp]
20482 __ dci(0xa40023e0); // ld1rqb {z0.b}, p0/z, [sp]
20483 __ dci(0xa40003e0); // ld1rqb {z0.b}, p0/z, [sp, x0]
20484 __ dci(0xa58023e0); // ld1rqd {z0.d}, p0/z, [sp]
20485 __ dci(0xa58003e0); // ld1rqd {z0.d}, p0/z, [sp, x0, lsl #3]
20486 __ dci(0xa48023e0); // ld1rqh {z0.h}, p0/z, [sp]
20487 __ dci(0xa48003e0); // ld1rqh {z0.h}, p0/z, [sp, x0, lsl #1]
20488 __ dci(0xa50023e0); // ld1rqw {z0.s}, p0/z, [sp]
20489 __ dci(0xa50003e0); // ld1rqw {z0.s}, p0/z, [sp, x0, lsl #2]
20490 __ dci(0x85c0c3e0); // ld1rsb {z0.h}, p0/z, [sp]
20491 __ dci(0x85c0a3e0); // ld1rsb {z0.s}, p0/z, [sp]
20492 __ dci(0x85c083e0); // ld1rsb {z0.d}, p0/z, [sp]
20493 __ dci(0x8540a3e0); // ld1rsh {z0.s}, p0/z, [sp]
20494 __ dci(0x854083e0); // ld1rsh {z0.d}, p0/z, [sp]
20495 __ dci(0x84c083e0); // ld1rsw {z0.d}, p0/z, [sp]
20496 __ dci(0x8540c3e0); // ld1rw {z0.s}, p0/z, [sp]
20497 __ dci(0x8540e3e0); // ld1rw {z0.d}, p0/z, [sp]
20498 __ dci(0xa5c0a3e0); // ld1sb {z0.h}, p0/z, [sp]
20499 __ dci(0xa5a0a3e0); // ld1sb {z0.s}, p0/z, [sp]
20500 __ dci(0xa580a3e0); // ld1sb {z0.d}, p0/z, [sp]
20501 __ dci(0xa5c043e0); // ld1sb {z0.h}, p0/z, [sp, x0]
20502 __ dci(0xa5a043e0); // ld1sb {z0.s}, p0/z, [sp, x0]
20503 __ dci(0xa58043e0); // ld1sb {z0.d}, p0/z, [sp, x0]
20504 __ dci(0xc44083e0); // ld1sb {z0.d}, p0/z, [sp, z0.d]
20505 __ dci(0xa520a3e0); // ld1sh {z0.s}, p0/z, [sp]
20506 __ dci(0xa500a3e0); // ld1sh {z0.d}, p0/z, [sp]
20507 __ dci(0xa52043e0); // ld1sh {z0.s}, p0/z, [sp, x0, lsl #1]
20508 __ dci(0xa50043e0); // ld1sh {z0.d}, p0/z, [sp, x0, lsl #1]
20509 __ dci(0xc4e083e0); // ld1sh {z0.d}, p0/z, [sp, z0.d, lsl #1]
20510 __ dci(0xc4c083e0); // ld1sh {z0.d}, p0/z, [sp, z0.d]
20511 __ dci(0xa480a3e0); // ld1sw {z0.d}, p0/z, [sp]
20512 __ dci(0xa48043e0); // ld1sw {z0.d}, p0/z, [sp, x0, lsl #2]
20513 __ dci(0xc56083e0); // ld1sw {z0.d}, p0/z, [sp, z0.d, lsl #2]
20514 __ dci(0xc54083e0); // ld1sw {z0.d}, p0/z, [sp, z0.d]
20515 __ dci(0xa540a3e0); // ld1w {z0.s}, p0/z, [sp]
20516 __ dci(0xa560a3e0); // ld1w {z0.d}, p0/z, [sp]
20517 __ dci(0xa54043e0); // ld1w {z0.s}, p0/z, [sp, x0, lsl #2]
20518 __ dci(0xa56043e0); // ld1w {z0.d}, p0/z, [sp, x0, lsl #2]
20519 __ dci(0xc560c3e0); // ld1w {z0.d}, p0/z, [sp, z0.d, lsl #2]
20520 __ dci(0xc540c3e0); // ld1w {z0.d}, p0/z, [sp, z0.d]
20521 __ dci(0xa420e3e0); // ld2b {z0.b, z1.b}, p0/z, [sp]
20522 __ dci(0xa420c3e0); // ld2b {z0.b, z1.b}, p0/z, [sp, x0]
20523 __ dci(0xa5a0e3e0); // ld2d {z0.d, z1.d}, p0/z, [sp]
20524 __ dci(0xa5a0c3e0); // ld2d {z0.d, z1.d}, p0/z, [sp, x0, lsl #3]
20525 __ dci(0xa4a0e3e0); // ld2h {z0.h, z1.h}, p0/z, [sp]
20526 __ dci(0xa4a0c3e0); // ld2h {z0.h, z1.h}, p0/z, [sp, x0, lsl #1]
20527 __ dci(0xa520e3e0); // ld2w {z0.s, z1.s}, p0/z, [sp]
20528 __ dci(0xa520c3e0); // ld2w {z0.s, z1.s}, p0/z, [sp, x0, lsl #2]
20529 __ dci(0xa440e3e0); // ld3b {z0.b, z1.b, z2.b}, p0/z, [sp]
20530 __ dci(0xa440c3e0); // ld3b {z0.b, z1.b, z2.b}, p0/z, [sp, x0]
20531 __ dci(0xa5c0e3e0); // ld3d {z0.d, z1.d, z2.d}, p0/z, [sp]
20532 __ dci(0xa5c0c3e0); // ld3d {z0.d, z1.d, z2.d}, p0/z, [sp, x0, lsl #3]
20533 __ dci(0xa4c0e3e0); // ld3h {z0.h, z1.h, z2.h}, p0/z, [sp]
20534 __ dci(0xa4c0c3e0); // ld3h {z0.h, z1.h, z2.h}, p0/z, [sp, x0, lsl #1]
20535 __ dci(0xa540e3e0); // ld3w {z0.s, z1.s, z2.s}, p0/z, [sp]
20536 __ dci(0xa540c3e0); // ld3w {z0.s, z1.s, z2.s}, p0/z, [sp, x0, lsl #2]
20537 __ dci(0xa460e3e0); // ld4b {z0.b, z1.b, z2.b, z3.b}, p0/z, [sp]
20538 __ dci(0xa460c3e0); // ld4b {z0.b, z1.b, z2.b, z3.b}, p0/z, [sp, x0]
20539 __ dci(0xa5e0e3e0); // ld4d {z0.d, z1.d, z2.d, z3.d}, p0/z, [sp]
20540 __ dci(
20541 0xa5e0c3e0); // ld4d {z0.d, z1.d, z2.d, z3.d}, p0/z, [sp, x0, lsl #3]
20542 __ dci(0xa4e0e3e0); // ld4h {z0.h, z1.h, z2.h, z3.h}, p0/z, [sp]
20543 __ dci(
20544 0xa4e0c3e0); // ld4h {z0.h, z1.h, z2.h, z3.h}, p0/z, [sp, x0, lsl #1]
20545 __ dci(0xa560e3e0); // ld4w {z0.s, z1.s, z2.s, z3.s}, p0/z, [sp]
20546 __ dci(
20547 0xa560c3e0); // ld4w {z0.s, z1.s, z2.s, z3.s}, p0/z, [sp, x0, lsl #2]
20548 __ dci(0xa42063e0); // ldff1b {z0.h}, p0/z, [sp, x0]
20549 __ dci(0xa44063e0); // ldff1b {z0.s}, p0/z, [sp, x0]
20550 __ dci(0xa46063e0); // ldff1b {z0.d}, p0/z, [sp, x0]
20551 __ dci(0xa40063e0); // ldff1b {z0.b}, p0/z, [sp, x0]
20552 __ dci(0xc440e3e0); // ldff1b {z0.d}, p0/z, [sp, z0.d]
20553 __ dci(0xa5e063e0); // ldff1d {z0.d}, p0/z, [sp, x0, lsl #3]
20554 __ dci(0xc5e0e3e0); // ldff1d {z0.d}, p0/z, [sp, z0.d, lsl #3]
20555 __ dci(0xc5c0e3e0); // ldff1d {z0.d}, p0/z, [sp, z0.d]
20556 __ dci(0xa4a063e0); // ldff1h {z0.h}, p0/z, [sp, x0, lsl #1]
20557 __ dci(0xa4c063e0); // ldff1h {z0.s}, p0/z, [sp, x0, lsl #1]
20558 __ dci(0xa4e063e0); // ldff1h {z0.d}, p0/z, [sp, x0, lsl #1]
20559 __ dci(0xc4e0e3e0); // ldff1h {z0.d}, p0/z, [sp, z0.d, lsl #1]
20560 __ dci(0xc4c0e3e0); // ldff1h {z0.d}, p0/z, [sp, z0.d]
20561 __ dci(0xa5c063e0); // ldff1sb {z0.h}, p0/z, [sp, x0]
20562 __ dci(0xa5a063e0); // ldff1sb {z0.s}, p0/z, [sp, x0]
20563 __ dci(0xa58063e0); // ldff1sb {z0.d}, p0/z, [sp, x0]
20564 __ dci(0xc440a3e0); // ldff1sb {z0.d}, p0/z, [sp, z0.d]
20565 __ dci(0xa52063e0); // ldff1sh {z0.s}, p0/z, [sp, x0, lsl #1]
20566 __ dci(0xa50063e0); // ldff1sh {z0.d}, p0/z, [sp, x0, lsl #1]
20567 __ dci(0xc4e0a3e0); // ldff1sh {z0.d}, p0/z, [sp, z0.d, lsl #1]
20568 __ dci(0xc4c0a3e0); // ldff1sh {z0.d}, p0/z, [sp, z0.d]
20569 __ dci(0xa48063e0); // ldff1sw {z0.d}, p0/z, [sp, x0, lsl #2]
20570 __ dci(0xc560a3e0); // ldff1sw {z0.d}, p0/z, [sp, z0.d, lsl #2]
20571 __ dci(0xc540a3e0); // ldff1sw {z0.d}, p0/z, [sp, z0.d]
20572 __ dci(0xa54063e0); // ldff1w {z0.s}, p0/z, [sp, x0, lsl #2]
20573 __ dci(0xa56063e0); // ldff1w {z0.d}, p0/z, [sp, x0, lsl #2]
20574 __ dci(0xc560e3e0); // ldff1w {z0.d}, p0/z, [sp, z0.d, lsl #2]
20575 __ dci(0xc540e3e0); // ldff1w {z0.d}, p0/z, [sp, z0.d]
20576 __ dci(0xa430a3e0); // ldnf1b {z0.h}, p0/z, [sp]
20577 __ dci(0xa450a3e0); // ldnf1b {z0.s}, p0/z, [sp]
20578 __ dci(0xa470a3e0); // ldnf1b {z0.d}, p0/z, [sp]
20579 __ dci(0xa410a3e0); // ldnf1b {z0.b}, p0/z, [sp]
20580 __ dci(0xa5f0a3e0); // ldnf1d {z0.d}, p0/z, [sp]
20581 __ dci(0xa4b0a3e0); // ldnf1h {z0.h}, p0/z, [sp]
20582 __ dci(0xa4d0a3e0); // ldnf1h {z0.s}, p0/z, [sp]
20583 __ dci(0xa4f0a3e0); // ldnf1h {z0.d}, p0/z, [sp]
20584 __ dci(0xa5d0a3e0); // ldnf1sb {z0.h}, p0/z, [sp]
20585 __ dci(0xa5b0a3e0); // ldnf1sb {z0.s}, p0/z, [sp]
20586 __ dci(0xa590a3e0); // ldnf1sb {z0.d}, p0/z, [sp]
20587 __ dci(0xa530a3e0); // ldnf1sh {z0.s}, p0/z, [sp]
20588 __ dci(0xa510a3e0); // ldnf1sh {z0.d}, p0/z, [sp]
20589 __ dci(0xa490a3e0); // ldnf1sw {z0.d}, p0/z, [sp]
20590 __ dci(0xa550a3e0); // ldnf1w {z0.s}, p0/z, [sp]
20591 __ dci(0xa570a3e0); // ldnf1w {z0.d}, p0/z, [sp]
20592 __ dci(0xa400e3e0); // ldnt1b {z0.b}, p0/z, [sp]
20593 __ dci(0xa400c3e0); // ldnt1b {z0.b}, p0/z, [sp, x0]
20594 __ dci(0xa580e3e0); // ldnt1d {z0.d}, p0/z, [sp]
20595 __ dci(0xa580c3e0); // ldnt1d {z0.d}, p0/z, [sp, x0, lsl #3]
20596 __ dci(0xa480e3e0); // ldnt1h {z0.h}, p0/z, [sp]
20597 __ dci(0xa480c3e0); // ldnt1h {z0.h}, p0/z, [sp, x0, lsl #1]
20598 __ dci(0xa500e3e0); // ldnt1w {z0.s}, p0/z, [sp]
20599 __ dci(0xa500c3e0); // ldnt1w {z0.s}, p0/z, [sp, x0, lsl #2]
20600 __ dci(0x858043e0); // ldr z0, [sp]
20601 __ dci(0xe400e3e0); // st1b {z0.b}, p0, [sp]
20602 __ dci(0xe40043e0); // st1b {z0.b}, p0, [sp, x0]
20603 __ dci(0xe400a3e0); // st1b {z0.d}, p0, [sp, z0.d]
20604 __ dci(0xe5e0e3e0); // st1d {z0.d}, p0, [sp]
20605 __ dci(0xe5e043e0); // st1d {z0.d}, p0, [sp, x0, lsl #3]
20606 __ dci(0xe5a0a3e0); // st1d {z0.d}, p0, [sp, z0.d, lsl #3]
20607 __ dci(0xe580a3e0); // st1d {z0.d}, p0, [sp, z0.d]
20608 __ dci(0xe4e0e3e0); // st1h {z0.d}, p0, [sp]
20609 __ dci(0xe4e043e0); // st1h {z0.d}, p0, [sp, x0, lsl #1]
20610 __ dci(0xe4a0a3e0); // st1h {z0.d}, p0, [sp, z0.d, lsl #1]
20611 __ dci(0xe480a3e0); // st1h {z0.d}, p0, [sp, z0.d]
20612 __ dci(0xe560e3e0); // st1w {z0.d}, p0, [sp]
20613 __ dci(0xe56043e0); // st1w {z0.d}, p0, [sp, x0, lsl #2]
20614 __ dci(0xe430e3e0); // st2b {z0.b, z1.b}, p0, [sp]
20615 __ dci(0xe42063e0); // st2b {z0.b, z1.b}, p0, [sp, x0]
20616 __ dci(0xe5b0e3e0); // st2d {z0.d, z1.d}, p0, [sp]
20617 __ dci(0xe5a063e0); // st2d {z0.d, z1.d}, p0, [sp, x0, lsl #3]
20618 __ dci(0xe4b0e3e0); // st2h {z0.h, z1.h}, p0, [sp]
20619 __ dci(0xe4a063e0); // st2h {z0.h, z1.h}, p0, [sp, x0, lsl #1]
20620 __ dci(0xe530e3e0); // st2w {z0.s, z1.s}, p0, [sp]
20621 __ dci(0xe52063e0); // st2w {z0.s, z1.s}, p0, [sp, x0, lsl #2]
20622 __ dci(0xe450e3e0); // st3b {z0.b, z1.b, z2.b}, p0, [sp]
20623 __ dci(0xe44063e0); // st3b {z0.b, z1.b, z2.b}, p0, [sp, x0]
20624 __ dci(0xe5d0e3e0); // st3d {z0.d, z1.d, z2.d}, p0, [sp]
20625 __ dci(0xe5c063e0); // st3d {z0.d, z1.d, z2.d}, p0, [sp, x0, lsl #3]
20626 __ dci(0xe4d0e3e0); // st3h {z0.h, z1.h, z2.h}, p0, [sp]
20627 __ dci(0xe4c063e0); // st3h {z0.h, z1.h, z2.h}, p0, [sp, x0, lsl #1]
20628 __ dci(0xe550e3e0); // st3w {z0.s, z1.s, z2.s}, p0, [sp]
20629 __ dci(0xe54063e0); // st3w {z0.s, z1.s, z2.s}, p0, [sp, x0, lsl #2]
20630 __ dci(0xe470e3e0); // st4b {z0.b, z1.b, z2.b, z3.b}, p0, [sp]
20631 __ dci(0xe46063e0); // st4b {z0.b, z1.b, z2.b, z3.b}, p0, [sp, x0]
20632 __ dci(0xe5f0e3e0); // st4d {z0.d, z1.d, z2.d, z3.d}, p0, [sp]
20633 __ dci(0xe5e063e0); // st4d {z0.d, z1.d, z2.d, z3.d}, p0, [sp, x0, lsl #3]
20634 __ dci(0xe4f0e3e0); // st4h {z0.h, z1.h, z2.h, z3.h}, p0, [sp]
20635 __ dci(0xe4e063e0); // st4h {z0.h, z1.h, z2.h, z3.h}, p0, [sp, x0, lsl #1]
20636 __ dci(0xe570e3e0); // st4w {z0.s, z1.s, z2.s, z3.s}, p0, [sp]
20637 __ dci(0xe56063e0); // st4w {z0.s, z1.s, z2.s, z3.s}, p0, [sp, x0, lsl #2]
20638 __ dci(0xe410e3e0); // stnt1b {z0.b}, p0, [sp]
20639 __ dci(0xe40063e0); // stnt1b {z0.b}, p0, [sp, x0]
20640 __ dci(0xe590e3e0); // stnt1d {z0.d}, p0, [sp]
20641 __ dci(0xe58063e0); // stnt1d {z0.d}, p0, [sp, x0, lsl #3]
20642 __ dci(0xe490e3e0); // stnt1h {z0.h}, p0, [sp]
20643 __ dci(0xe48063e0); // stnt1h {z0.h}, p0, [sp, x0, lsl #1]
20644 __ dci(0xe510e3e0); // stnt1w {z0.s}, p0, [sp]
20645 __ dci(0xe50063e0); // stnt1w {z0.s}, p0, [sp, x0, lsl #2]
20646 __ dci(0x858003e0); // ldr p0, [sp]
20647 __ dci(0xe58003e0); // str p0, [sp]
20648 __ dci(0xe58043e0); // str z0, [sp]
20649 }
20650
20651 __ Drop(128 * 2 * kXRegSizeInBytes);
20652
20653 END();
20654
20655 if (CAN_RUN()) {
20656 RUN();
20657
20658 // No checks are made here. The test is designed to ensure that the base
20659 // register is interpreted correctly as sp, not xzr. If it is interpreted
20660 // as xzr, the memory access to addresses near zero will fault, and the
20661 // test will fail.
20662 }
20663}
20664
20665// Manually constructed simulator test to avoid creating a VL128 variant.
20666
20667#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
20668void Test_sve_fmatmul(Test* config) {
20669 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVEF64MM);
20670
20671 // Only double-precision matrix multiply is tested here. Single-precision is
20672 // tested in the simulator tests using a generated sequence. The (templated)
20673 // code used in the simulator for both cases is the same, which is why the
20674 // tests here don't need to be comprehensive.
20675 START();
20676 Label vl_too_short;
20677 __ Rdvl(x0, 1);
20678 __ Cmp(x0, 32);
20679 __ B(lt, &vl_too_short); // Skip testing VL128.
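  // Rdvl x0, 1 returns the vector length in bytes, so the body is skipped for
  // a 128-bit (16-byte) vector length, matching the comment above about
  // avoiding a VL128 variant.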
20680
20681 __ Fdup(z0.VnD(), 1.0);
20682 __ Fdup(z1.VnD(), 2.0);
20683 __ Mov(z2.VnD(), 0);
20684
20685 // Build 2x2 identity matrix in z3.
20686 Label iden_loop;
20687 __ Lsr(x0, x0, 5);
20688 __ Bind(&iden_loop);
20689 __ Insr(z3.VnD(), d0);
20690 __ Insr(z3.VnD(), d2);
20691 __ Insr(z3.VnD(), d2);
20692 __ Insr(z3.VnD(), d0);
20693 __ Sub(x0, x0, 1);
20694 __ Cbnz(x0, &iden_loop);
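  // x0 was set to VL / 32 above, i.e. the number of 256-bit segments. Each
  // iteration inserts {1.0, 0.0, 0.0, 1.0}, so every segment of z3 ends up
  // holding a 2x2 identity matrix for the Fmmla operations below.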
20695
20696 __ Fmmla(z1.VnD(), z1.VnD(), z0.VnD(), z0.VnD());
20697 __ Fmmla(z2.VnD(), z2.VnD(), z1.VnD(), z3.VnD());
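  // z2 starts as zero, so the second Fmmla multiplies z1 by the identity in
  // z3; z2 should therefore equal z1, which is checked after the run.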
20698
20699 __ Ptrue(p0.VnB());
20700 __ Index(z4.VnD(), -8, 3);
20701 __ Scvtf(z4.VnD(), p0.Merging(), z4.VnD());
20702 __ Mov(z5.VnD(), 0);
20703 __ Fmmla(z4.VnD(), z4.VnD(), z4.VnD(), z4.VnD());
20704 __ Fmmla(z5.VnD(), z5.VnD(), z4.VnD(), z3.VnD());
20705
20706 __ Bind(&vl_too_short);
20707 END();
20708
20709 if (CAN_RUN()) {
20710 RUN();
20711
20712 if (core.GetSVELaneCount(kDRegSize) >= 4) { // VL256 or longer.
20713 ASSERT_EQUAL_SVE(z1, z2);
20714 ASSERT_EQUAL_SVE(z4, z5);
20715
20716 // All results are 4.0:
20717 // z0 z0 z1
20718 // (1 1)(1 1) + (2 2) = (4 4)
20719 // (1 1)(1 1) (2 2) (4 4)
20720 uint64_t z1_expected[] =
20721 {0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
20722 0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
20723 0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
20724 0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
20725 0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
20726 0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
20727 0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
20728 0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
20729 0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
20730 0x4010000000000000, 0x4010000000000000, 0x4010000000000000,
20731 0x4010000000000000, 0x4010000000000000};
20732 ASSERT_EQUAL_SVE(z1_expected, z1.VnD());
20733
20734 // First (highest z4_expected index) multiplications are:
20735 // z4 z4 z4
20736 // (-8 -5)(-8 -2) + (-8 -5) = (81 6)
20737 // (-2 1)(-5 1) (-2 1) ( 9 6)
20738 //
20739 // ( 4 7)( 4 10) + ( 4 7) = ( 69 138)
20740 // (10 13)( 7 13) (10 13) (141 282)
20741 uint64_t z4_expected[] = {
20742 0x40cb690000000000, 0x40c9728000000000, 0x40c9710000000000,
20743 0x40c79e8000000000, 0x40c41f0000000000, 0x40c2708000000000,
20744 0x40c26f0000000000, 0x40c0e48000000000, 0x40bbea0000000000,
20745 0x40b91d0000000000, 0x40b91a0000000000, 0x40b6950000000000,
20746 0x40b1d60000000000, 0x40af320000000000, 0x40af2c0000000000,
20747 0x40ab420000000000, 0x40a4040000000000, 0x40a0aa0000000000,
20748 0x40a0a40000000000, 0x409bb40000000000, 0x4091b80000000000,
20749 0x408a880000000000, 0x408a700000000000, 0x4083c80000000000,
20750 0x4071a00000000000, 0x4061a00000000000, 0x4061400000000000,
20751 0x4051400000000000, 0x4018000000000000, 0x4022000000000000,
20752 0x4018000000000000, 0x4054400000000000,
20753 };
20754 ASSERT_EQUAL_SVE(z4_expected, z4.VnD());
20755 }
20756 }
20757}
20758Test* test_sve_fmatmul_list[] =
20759 {Test::MakeSVETest(256, "AARCH64_ASM_sve_fmatmul_vl256", &Test_sve_fmatmul),
20760 Test::MakeSVETest(512, "AARCH64_ASM_sve_fmatmul_vl512", &Test_sve_fmatmul),
20761 Test::MakeSVETest(2048,
20762 "AARCH64_ASM_sve_fmatmul_vl2048",
20763 &Test_sve_fmatmul)};
20764
20765void Test_sve_ld1ro(Test* config) {
20766 SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kSVEF64MM);
20767 START();
20768
20769 int data_size = (kQRegSizeInBytes + 128) * 4;
20770 uint8_t* data = new uint8_t[data_size];
20771 for (int i = 0; i < data_size; i++) {
20772 data[i] = i & 0xff;
20773 }
20774
20775 // Set the base to just past half-way through the buffer so we can use
20776 // negative indices.
20777 __ Mov(x0, reinterpret_cast<uintptr_t>(&data[7 + data_size / 2]));
20778
20779 __ Index(z0.VnB(), 0, 1);
20780 __ Ptrue(p0.VnB());
20781 __ Cmplo(p0.VnB(), p0.Zeroing(), z0.VnB(), 4);
20782 __ Pfalse(p1.VnB());
20783 __ Zip1(p1.VnB(), p0.VnB(), p1.VnB());
20784 __ Ptrue(p2.VnB());
20785
20786 __ Mov(x1, -32);
20787 __ Ld1rob(z0.VnB(), p1.Zeroing(), SVEMemOperand(x0, -32));
20788 __ Ld1rob(z1.VnB(), p1.Zeroing(), SVEMemOperand(x0, x1));
20789
20790 __ Mov(x1, 64 / 2);
20791 __ Ld1roh(z2.VnH(), p2.Zeroing(), SVEMemOperand(x0, 64));
20792 __ Ld1roh(z3.VnH(), p2.Zeroing(), SVEMemOperand(x0, x1, LSL, 1));
20793
20794 __ Mov(x1, -96 / 4);
20795 __ Ld1row(z4.VnS(), p2.Zeroing(), SVEMemOperand(x0, -96));
20796 __ Ld1row(z5.VnS(), p2.Zeroing(), SVEMemOperand(x0, x1, LSL, 2));
20797
20798 __ Mov(x1, 128 / 8);
20799 __ Ld1rod(z6.VnD(), p2.Zeroing(), SVEMemOperand(x0, 128));
20800 __ Ld1rod(z7.VnD(), p2.Zeroing(), SVEMemOperand(x0, x1, LSL, 3));
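  // Each pair of loads above reads the same 32-byte block twice: once with an
  // immediate offset and once with an equivalent register offset (x1 is
  // pre-scaled to match), so the destination pairs can be compared after the
  // run.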
20801
20802 // Check that all 256-bit segments match by rotating the vector by one
20803 // segment, eoring, and orring across the vector.
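  // For example, with four 256-bit segments [S3 S2 S1 S0], Ext by 32 bytes
  // rotates the vector to [S0 S3 S2 S1]; the Eor result is zero in every lane
  // only if all segments are equal, and Orv collapses it to a single byte
  // that is accumulated in z9.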
20804 __ Dup(z11.VnQ(), z0.VnQ(), 2);
20805 __ Mov(z8, z0);
20806 __ Ext(z8.VnB(), z8.VnB(), z8.VnB(), 32);
20807 __ Eor(z8.VnB(), z8.VnB(), z0.VnB());
20808 __ Orv(b9, p2, z8.VnB());
20809
20810 __ Mov(z8, z2);
20811 __ Ext(z8.VnB(), z8.VnB(), z8.VnB(), 32);
20812 __ Eor(z8.VnB(), z8.VnB(), z2.VnB());
20813 __ Orv(b8, p2, z8.VnB());
20814 __ Orr(z9, z9, z8);
20815
20816 __ Mov(z8, z4);
20817 __ Ext(z8.VnB(), z8.VnB(), z8.VnB(), 32);
20818 __ Eor(z8.VnB(), z8.VnB(), z4.VnB());
20819 __ Orv(b8, p2, z8.VnB());
20820 __ Orr(z9, z9, z8);
20821
20822 __ Mov(z8, z6);
20823 __ Ext(z8.VnB(), z8.VnB(), z8.VnB(), 32);
20824 __ Eor(z8.VnB(), z8.VnB(), z6.VnB());
20825 __ Orv(b8, p2, z8.VnB());
20826 __ Orr(z9, z9, z8);
20827
20828 END();
20829
20830 if (CAN_RUN()) {
20831 RUN();
20832
20833 int vl = core.GetSVELaneCount(kBRegSize) * 8;
20834 if (vl >= 256) {
20835 ASSERT_EQUAL_SVE(z0, z1);
20836 ASSERT_EQUAL_SVE(z2, z3);
20837 ASSERT_EQUAL_SVE(z4, z5);
20838 ASSERT_EQUAL_SVE(z6, z7);
20839
20840 // Check the result of the rotate/eor sequence.
20841 uint64_t expected_z9[] = {0, 0};
20842 ASSERT_EQUAL_SVE(expected_z9, z9.VnD());
20843 }
20844 }
  delete[] data;
20845}
20846Test* test_sve_ld1ro_list[] =
20847 {Test::MakeSVETest(256, "AARCH64_ASM_sve_ld1ro_vl256", &Test_sve_ld1ro),
20848 Test::MakeSVETest(512, "AARCH64_ASM_sve_ld1ro_vl512", &Test_sve_ld1ro),
20849 Test::MakeSVETest(2048, "AARCH64_ASM_sve_ld1ro_vl2048", &Test_sve_ld1ro)};
20850#endif
20851
20852} // namespace aarch64
20853} // namespace vixl