// Copyright 2019, VIXL authors
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
//   * Redistributions of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//   * Redistributions in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//   * Neither the name of ARM Limited nor the names of its contributors may be
//     used to endorse or promote products derived from this software without
//     specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <sys/mman.h>

#include <cfloat>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>

#include "test-runner.h"
#include "test-utils.h"
#include "aarch64/test-utils-aarch64.h"

#include "aarch64/cpu-aarch64.h"
#include "aarch64/disasm-aarch64.h"
#include "aarch64/macro-assembler-aarch64.h"
#include "aarch64/simulator-aarch64.h"
#include "test-assembler-aarch64.h"

namespace vixl {
namespace aarch64 {

// Call masm->Insr repeatedly to allow test inputs to be set up concisely. This
// is optimised for call-site clarity, not generated code quality, so it doesn't
// exist in the MacroAssembler itself.
//
// Usage:
//
//    int values[] = { 42, 43, 44 };
//    InsrHelper(&masm, z0.VnS(), values);   // Sets z0.S = { ..., 42, 43, 44 }
//
// The rightmost (highest-indexed) array element maps to the lowest-numbered
// lane.
template <typename T, size_t N>
void InsrHelper(MacroAssembler* masm,
                const ZRegister& zdn,
                const T (&values)[N]) {
  for (size_t i = 0; i < N; i++) {
    masm->Insr(zdn, values[i]);
  }
}

// Conveniently initialise P registers. This is optimised for call-site clarity,
// not generated code quality.
//
// Usage:
//
//    int values[] = { 0x0, 0x1, 0x2 };
//    Initialise(&masm, p0.VnS(), values);   // Sets p0 = 0b'0000'0001'0010
//
// The rightmost (highest-indexed) array element maps to the lowest-numbered
// lane.
//
// Each element of the `values` array is mapped onto a lane in `pd`. The
// architecture only respects the lower bit, and writes zero to the upper bits,
// but other (encodable) values can be specified if required by the test.
template <typename T, size_t N>
void Initialise(MacroAssembler* masm,
                const PRegisterWithLaneSize& pd,
                const T (&values)[N]) {
  UseScratchRegisterScope temps(masm);
  Register temp = temps.AcquireX();
  Label data;
  Label done;

  // There is no 'insr' for P registers. The easiest way to initialise one with
  // an arbitrary value is to load it from a literal pool.

  int p_bits_per_lane = pd.GetLaneSizeInBits() / kZRegBitsPerPRegBit;
  VIXL_ASSERT((N * p_bits_per_lane) <= kPRegMaxSize);
  uint64_t p_lane_mask = GetUintMask(p_bits_per_lane);

  // For most lane sizes, each value contributes less than a byte. We need to
  // pack them into chunks which we can store directly. It's sensible for the
  // chunk to be the same size as an instruction because we need to pad to an
  // instruction boundary anyway.
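  // (For example, with B-sized lanes each value contributes a single predicate
  // bit, so one 32-bit instruction-sized chunk holds 32 lane values; D-sized
  // lanes contribute eight bits each, so a chunk holds four.)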
  typedef Instr Chunk;
  const size_t kChunkSizeInBits = sizeof(Chunk) * kBitsPerByte;
  VIXL_ASSERT((kPRegMaxSize % kChunkSizeInBits) == 0);
  const size_t kPRegMaxSizeInChunks = kPRegMaxSize / kChunkSizeInBits;

  masm->Adr(temp, &data);
  // TODO: Use `Ldr(pd, MemOperand(temp))` once available.
  masm->Ldr(PRegister(pd.GetCode()), temp);
  masm->B(&done);
  {
    ExactAssemblyScope total(masm, kPRegMaxSizeInBytes);
    masm->bind(&data);
    // Put the last-specified value at the lowest address.
    int values_index = N - 1;
    for (size_t c = 0; c < kPRegMaxSizeInChunks; c++) {
      Chunk chunk = 0;
      // Whilst we still have values left, use them to populate the chunk.
      for (size_t chunk_bit = 0;
           (chunk_bit < kChunkSizeInBits) && (values_index >= 0);
           chunk_bit += p_bits_per_lane) {
        Chunk value = values[values_index] & p_lane_mask;
        VIXL_ASSERT(static_cast<T>(value) == values[values_index]);
        chunk |= value << chunk_bit;
        values_index--;
      }
      masm->dc(chunk);
    }
  }
  masm->Bind(&done);
}

// Ensure that basic test infrastructure works.
TEST(sve_test_infrastructure_z) {
  SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  __ Mov(x0, 0x0123456789abcdef);

  // Test basic `Insr` behaviour.
  __ Insr(z0.VnB(), 1);
  __ Insr(z0.VnB(), 2);
  __ Insr(z0.VnB(), x0);
  __ Insr(z0.VnB(), -42);
  __ Insr(z0.VnB(), 0);

  // Test array inputs.
  int z1_inputs[] = {3, 4, 5, -42, 0};
  InsrHelper(&masm, z1.VnH(), z1_inputs);

  // Test that sign-extension works as intended for various lane sizes.
  __ Dup(z2.VnD(), 0);  // Clear the register first.
  __ Insr(z2.VnB(), -42);         // 0xd6
  __ Insr(z2.VnB(), 0xfe);        // 0xfe
  __ Insr(z2.VnH(), -42);         // 0xffd6
  __ Insr(z2.VnH(), 0xfedc);      // 0xfedc
  __ Insr(z2.VnS(), -42);         // 0xffffffd6
  __ Insr(z2.VnS(), 0xfedcba98);  // 0xfedcba98
  // Use another register for VnD(), so we can support 128-bit Z registers.
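  // (With a 128-bit vector, z2 already holds 14 bytes of inserted data, so two
  // more D-sized insertions would not fit there.)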
  __ Insr(z3.VnD(), -42);                 // 0xffffffffffffffd6
  __ Insr(z3.VnD(), 0xfedcba9876543210);  // 0xfedcba9876543210

  END();

  if (CAN_RUN()) {
    RUN();

    // Test that array checks work properly on a register initialised
    // lane-by-lane.
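    // (`Insr` writes lane 0 and shifts the existing lanes up, so the value
    // inserted last ends up in the lowest-numbered lane, i.e. the rightmost
    // element of the expected array.)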
    int z0_inputs_b[] = {0x01, 0x02, 0xef, 0xd6, 0x00};
    ASSERT_EQUAL_SVE(z0_inputs_b, z0.VnB());

    // Test that lane-by-lane checks work properly on a register initialised
    // by array.
    for (size_t i = 0; i < ArrayLength(z1_inputs); i++) {
      // The rightmost (highest-indexed) array element maps to the
      // lowest-numbered lane.
      int lane = static_cast<int>(ArrayLength(z1_inputs) - i - 1);
      ASSERT_EQUAL_SVE_LANE(z1_inputs[i], z1.VnH(), lane);
    }

    uint64_t z2_inputs_d[] = {0x0000d6feffd6fedc, 0xffffffd6fedcba98};
    ASSERT_EQUAL_SVE(z2_inputs_d, z2.VnD());
    uint64_t z3_inputs_d[] = {0xffffffffffffffd6, 0xfedcba9876543210};
    ASSERT_EQUAL_SVE(z3_inputs_d, z3.VnD());
  }

  TEARDOWN();
}

// Ensure that basic test infrastructure works.
TEST(sve_test_infrastructure_p) {
  SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  // Simple cases: move boolean (0 or 1) values.

  int p0_inputs[] = {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0};
  Initialise(&masm, p0.VnB(), p0_inputs);

  int p1_inputs[] = {1, 0, 1, 1, 0, 1, 1, 1};
  Initialise(&masm, p1.VnH(), p1_inputs);

  int p2_inputs[] = {1, 1, 0, 1};
  Initialise(&masm, p2.VnS(), p2_inputs);

  int p3_inputs[] = {0, 1};
  Initialise(&masm, p3.VnD(), p3_inputs);

  // Advanced cases: move numeric values into architecturally-ignored bits.

  // B-sized lanes get one bit in a P register, so there are no ignored bits.

  // H-sized lanes get two bits in a P register.
  int p4_inputs[] = {0x3, 0x2, 0x1, 0x0, 0x1, 0x2, 0x3};
  Initialise(&masm, p4.VnH(), p4_inputs);

  // S-sized lanes get four bits in a P register.
  int p5_inputs[] = {0xc, 0x7, 0x9, 0x6, 0xf};
  Initialise(&masm, p5.VnS(), p5_inputs);

  // D-sized lanes get eight bits in a P register.
  int p6_inputs[] = {0x81, 0xcc, 0x55};
  Initialise(&masm, p6.VnD(), p6_inputs);

  // The largest possible P register has 32 bytes.
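  // (With D-sized lanes, each of the 32 values below occupies eight predicate
  // bits, exactly filling those 256 bits.)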
  int p7_inputs[] = {0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
                     0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
                     0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
                     0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f};
  Initialise(&masm, p7.VnD(), p7_inputs);

  END();

  if (CAN_RUN()) {
    RUN();

    // Test that lane-by-lane checks work properly. The rightmost
    // (highest-indexed) array element maps to the lowest-numbered lane.
    for (size_t i = 0; i < ArrayLength(p0_inputs); i++) {
      int lane = static_cast<int>(ArrayLength(p0_inputs) - i - 1);
      ASSERT_EQUAL_SVE_LANE(p0_inputs[i], p0.VnB(), lane);
    }
    for (size_t i = 0; i < ArrayLength(p1_inputs); i++) {
      int lane = static_cast<int>(ArrayLength(p1_inputs) - i - 1);
      ASSERT_EQUAL_SVE_LANE(p1_inputs[i], p1.VnH(), lane);
    }
    for (size_t i = 0; i < ArrayLength(p2_inputs); i++) {
      int lane = static_cast<int>(ArrayLength(p2_inputs) - i - 1);
      ASSERT_EQUAL_SVE_LANE(p2_inputs[i], p2.VnS(), lane);
    }
    for (size_t i = 0; i < ArrayLength(p3_inputs); i++) {
      int lane = static_cast<int>(ArrayLength(p3_inputs) - i - 1);
      ASSERT_EQUAL_SVE_LANE(p3_inputs[i], p3.VnD(), lane);
    }

    // Test that array checks work properly on predicates initialised with a
    // possibly-different lane size.
    // 0b...11'10'01'00'01'10'11
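    // (Packed into D-sized eight-bit fields, this is 0b00111001 = 0x39 and
    // 0b00011011 = 0x1b.)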
    int p4_expected[] = {0x39, 0x1b};
    ASSERT_EQUAL_SVE(p4_expected, p4.VnD());

    ASSERT_EQUAL_SVE(p5_inputs, p5.VnS());

    // 0b...10000001'11001100'01010101
    int p6_expected[] = {2, 0, 0, 1, 3, 0, 3, 0, 1, 1, 1, 1};
    ASSERT_EQUAL_SVE(p6_expected, p6.VnH());

    // 0b...10011100'10011101'10011110'10011111
    int p7_expected[] = {1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1,
                         1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1};
    ASSERT_EQUAL_SVE(p7_expected, p7.VnB());
  }
}

static void MlaMlsHelper(unsigned lane_size_in_bits) {
  SETUP_WITH_FEATURES(CPUFeatures::kSVE);
  START();

  int za_inputs[] = {-39, 1, -3, 2};
  int zn_inputs[] = {-5, -20, 9, 8};
  int zm_inputs[] = {9, -5, 4, 5};

  ZRegister zd = z0.WithLaneSize(lane_size_in_bits);
  ZRegister za = z1.WithLaneSize(lane_size_in_bits);
  ZRegister zn = z2.WithLaneSize(lane_size_in_bits);
  ZRegister zm = z3.WithLaneSize(lane_size_in_bits);

  // TODO: Use a simple `Dup` once it accepts arbitrary immediates.
  __ Mov(w0, 0xdeadbeef);
  __ Dup(zd.VnS(), w0);
  InsrHelper(&masm, za, za_inputs);
  InsrHelper(&masm, zn, zn_inputs);
  InsrHelper(&masm, zm, zm_inputs);

  int p0_inputs[] = {1, 1, 0, 1};
  int p1_inputs[] = {1, 0, 1, 1};
  int p2_inputs[] = {0, 1, 1, 1};
  int p3_inputs[] = {1, 1, 1, 1};

  Initialise(&masm, p0.WithLaneSize(lane_size_in_bits), p0_inputs);
  Initialise(&masm, p1.WithLaneSize(lane_size_in_bits), p1_inputs);
  Initialise(&masm, p2.WithLaneSize(lane_size_in_bits), p2_inputs);
  Initialise(&masm, p3.WithLaneSize(lane_size_in_bits), p3_inputs);

  // The Mla macro automatically selects between mla, mad and movprfx + mla
  // based on what registers are aliased.
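  // (Roughly: `mla` when the destination aliases the addend, `mad` when it
  // aliases one of the multiplicands, and `movprfx` + `mla` when it is
  // distinct from all of the inputs.)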
  ZRegister mla_da_result = z10.WithLaneSize(lane_size_in_bits);
  ZRegister mla_dn_result = z11.WithLaneSize(lane_size_in_bits);
  ZRegister mla_dm_result = z12.WithLaneSize(lane_size_in_bits);

  __ Mov(mla_da_result, za);
  __ Mla(mla_da_result, p0.Merging(), mla_da_result, zn, zm);

  __ Mov(mla_dn_result, zn);
  __ Mla(mla_dn_result, p1.Merging(), za, mla_dn_result, zm);

  __ Mov(mla_dm_result, zm);
  __ Mla(mla_dm_result, p2.Merging(), za, zn, mla_dm_result);

  // TODO: Enable once movprfx is implemented.
  // __ Mla(mla_d_result, p3.Merging(), za, zn, zm);

  // The Mls macro automatically selects between mls, msb and movprfx + mls
  // based on what registers are aliased.
  ZRegister mls_da_result = z20.WithLaneSize(lane_size_in_bits);
  ZRegister mls_dn_result = z21.WithLaneSize(lane_size_in_bits);
  ZRegister mls_dm_result = z22.WithLaneSize(lane_size_in_bits);

  __ Mov(mls_da_result, za);
  __ Mls(mls_da_result, p0.Merging(), mls_da_result, zn, zm);

  __ Mov(mls_dn_result, zn);
  __ Mls(mls_dn_result, p1.Merging(), za, mls_dn_result, zm);

  __ Mov(mls_dm_result, zm);
  __ Mls(mls_dm_result, p2.Merging(), za, zn, mls_dm_result);

  // TODO: Enable once movprfx is implemented.
  // __ Mls(mls_d_result, p3.Merging(), za, zn, zm);

  END();

  if (CAN_RUN()) {
    RUN();

    ASSERT_EQUAL_SVE(za_inputs, z1.WithLaneSize(lane_size_in_bits));
    ASSERT_EQUAL_SVE(zn_inputs, z2.WithLaneSize(lane_size_in_bits));
    ASSERT_EQUAL_SVE(zm_inputs, z3.WithLaneSize(lane_size_in_bits));

    int mla[] = {-84, 101, 33, 42};
    int mls[] = {6, -99, -39, -38};
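    // Element-wise, mla[i] = za[i] + (zn[i] * zm[i]) and
    // mls[i] = za[i] - (zn[i] * zm[i]).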

    int mla_da_expected[] = {mla[0], mla[1], za_inputs[2], mla[3]};
    ASSERT_EQUAL_SVE(mla_da_expected, mla_da_result);

    int mla_dn_expected[] = {mla[0], zn_inputs[1], mla[2], mla[3]};
    ASSERT_EQUAL_SVE(mla_dn_expected, mla_dn_result);

    int mla_dm_expected[] = {zm_inputs[0], mla[1], mla[2], mla[3]};
    ASSERT_EQUAL_SVE(mla_dm_expected, mla_dm_result);

    // TODO: Enable once movprfx is implemented.
    // ASSERT_EQUAL_SVE(mla, mla_d_result);

    int mls_da_expected[] = {mls[0], mls[1], za_inputs[2], mls[3]};
    ASSERT_EQUAL_SVE(mls_da_expected, mls_da_result);

    int mls_dn_expected[] = {mls[0], zn_inputs[1], mls[2], mls[3]};
    ASSERT_EQUAL_SVE(mls_dn_expected, mls_dn_result);

    int mls_dm_expected[] = {zm_inputs[0], mls[1], mls[2], mls[3]};
    ASSERT_EQUAL_SVE(mls_dm_expected, mls_dm_result);

    // TODO: Enable once movprfx is implemented.
    // ASSERT_EQUAL_SVE(mls, mls_d_result);
  }
}

TEST(sve_mla_mls_b) { MlaMlsHelper(kBRegSize); }
TEST(sve_mla_mls_h) { MlaMlsHelper(kHRegSize); }
TEST(sve_mla_mls_s) { MlaMlsHelper(kSRegSize); }
TEST(sve_mla_mls_d) { MlaMlsHelper(kDRegSize); }

}  // namespace aarch64
}  // namespace vixl