author     Ilia Taraban <ilia.taraban@intel.com>  2019-01-10 12:34:17 +0000
committer  Ilia Taraban <ilia.taraban@intel.com>  2019-01-10 12:34:17 +0000
commit     88815825d21319328cf75f54f8d2c1c45a72e407 (patch)
tree       c6ec1b959262c5d914ee11398aafae2461f4a788
parent     e259b793a57df68cb0e97436441563533d8f745b (diff)
test-suite: add avx512 tests with move-load-store intrinsics
Summary:
Here is the next bunch of AVX512 tests. In each test we do some load,
store or move operations and also check the result.

Reviewers: MatzeB, craig.topper, zvi, RKSimon

Reviewed By: RKSimon

Subscribers: mgorny, llvm-commits

Differential Revision: https://reviews.llvm.org/D51599

git-svn-id: https://llvm.org/svn/llvm-project/test-suite/trunk@350816 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512BW/CMakeLists.txt  5
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512BW/Makefile  11
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512BW/load_store.c  267
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512BW/load_store.reference_output  2
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512BWVL/mask_set_bw.c  234
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512BWVL/mask_set_bw.reference_output  2
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512BWVL/mov_xyz_int.c  208
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512BWVL/mov_xyz_int.reference_output  2
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512BWVL/sets.c  316
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512BWVL/sets.reference_output  2
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512BWVL/vpmovdown_xmm.c  1253
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512BWVL/vpmovdown_xmm.reference_output  2
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512DQVL/CMakeLists.txt  5
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512DQVL/Makefile  11
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512DQVL/mask_set_dq.c  234
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512DQVL/mask_set_dq.reference_output  2
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512F/i32gather_64_m512.c  147
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512F/i32gather_64_m512.reference_output  2
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512F/i32scatter_64_m512.c  134
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512F/i32scatter_64_m512.reference_output  2
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512F/i64gather_32_m512.c  148
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512F/i64gather_32_m512.reference_output  2
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512F/i64scatter_32_m512.c  132
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512F/i64scatter_32_m512.reference_output  2
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512F/load_store_xyz.c  479
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512F/load_store_xyz.reference_output  2
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512F/mask_mov.c  135
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512F/mask_mov.reference_output  2
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512F/movedup.c  213
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512F/movedup.reference_output  2
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512F/store.c  144
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512F/store.reference_output  2
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512VL/CMakeLists.txt  5
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512VL/Makefile  11
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512VL/i32gather_32.c  135
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512VL/i32gather_32.reference_output  2
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512VL/i32gather_64.c  143
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512VL/i32gather_64.reference_output  2
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512VL/i32scatter_32.c  136
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512VL/i32scatter_32.reference_output  2
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512VL/i32scatter_64.c  141
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512VL/i32scatter_64.reference_output  2
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512VL/i64gather_32.c  137
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512VL/i64gather_32.reference_output  2
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512VL/i64gather_64.c  139
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512VL/i64gather_64.reference_output  2
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512VL/i64scatter_32.c  139
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512VL/i64scatter_32.reference_output  2
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512VL/i64scatter_64.c  135
-rw-r--r--  SingleSource/UnitTests/Vector/AVX512VL/i64scatter_64.reference_output  2
-rw-r--r--  SingleSource/UnitTests/Vector/CMakeLists.txt  3
-rw-r--r--  SingleSource/UnitTests/Vector/Makefile  13
52 files changed, 5257 insertions, 0 deletions
diff --git a/SingleSource/UnitTests/Vector/AVX512BW/CMakeLists.txt b/SingleSource/UnitTests/Vector/AVX512BW/CMakeLists.txt
new file mode 100644
index 00000000..5bc1341a
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512BW/CMakeLists.txt
@@ -0,0 +1,5 @@
+list(APPEND CPPFLAGS -I ${CMAKE_SOURCE_DIR}/${VECTOR_MAIN_DIR})
+list(APPEND LDFLAGS -lm)
+list(APPEND CFLAGS "-march=${X86CPU_ARCH}")
+list(APPEND CFLAGS -fms-extensions)
+llvm_singlesource(PREFIX "Vector-AVX512BW-")
diff --git a/SingleSource/UnitTests/Vector/AVX512BW/Makefile b/SingleSource/UnitTests/Vector/AVX512BW/Makefile
new file mode 100644
index 00000000..997559d6
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512BW/Makefile
@@ -0,0 +1,11 @@
+# SingleSource/UnitTests/Vector/AVX512BW/Makefile
+
+DIRS =
+LEVEL = ../../../..
+CFLAGS += -fms-extensions -march=native -mavx512bw -I${SourceDir}/..
+LDFLAGS += -lm
+
+include $(LEVEL)/SingleSource/Makefile.singlesrc
+
+TARGET_FLAGS += -march=native -mavx512bw
+LCCFLAGS += -march=native -mavx512bw
diff --git a/SingleSource/UnitTests/Vector/AVX512BW/load_store.c b/SingleSource/UnitTests/Vector/AVX512BW/load_store.c
new file mode 100644
index 00000000..1d29329a
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512BW/load_store.c
@@ -0,0 +1,267 @@
+/*
+ * Test load and store intrinsics.
+ * Here we check the _mm512_[mask|maskz]_[loadu|storeu] intrinsics.
+ */
+#include "m512_test_util.h"
+
+V512 src_vals[2];
+V512 all_ones;
+volatile int vol0 = 0;
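+
+/*
+ * Mask semantics recap: for a write-masked load such as
+ * _mm512_mask_loadu_epi32(old, k, p), lane i of the result is p[i] when
+ * bit i of k is set and old[i] otherwise; the maskz variants substitute
+ * zero for old[i]. E.g. with k = 0x5, lanes 0 and 2 come from memory and
+ * every other lane keeps (mask) or zeroes (maskz) its value.
+ */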
+
+void NOINLINE init() {
+ volatile int i;
+ int j;
+
+ for (i = 0; i < sizeof(src_vals) / sizeof(src_vals[0]); i++) {
+ for (j = 0; j < 16; j++) {
+ src_vals[i].s32[j] = 16 * i + j;
+ }
+ }
+
+ for (i = 0; i < 16; i++) {
+ all_ones.s32[i] = -1;
+ }
+}
+
+void NOINLINE do_loadu() {
+ V512 res;
+ V512 expected;
+ __mmask64 k64 = 0xfbde79feffeeffee;
+ __mmask32 k32 = 0xbfde79fe;
+ __mmask16 k16 = 0xbfde;
+ __mmask8 k8 = 0xaf;
+ volatile int i;
+ signed char *p8 = &src_vals[0].s8[0];
+ short *p16 = &src_vals[0].s16[0];
+ int *p = &src_vals[0].s32[0];
+ __int64 *p64 = &src_vals[0].s64[0];
+
+ res.zmm = _mm512_loadu_ps(&src_vals[0].s32[1]);
+ for (i = 0; i < 16; i++) {
+ expected.s32[i] = p[i + 1];
+ }
+ check_equal_nd(&res, &expected, 16, "_mm512_loadu_ps", __LINE__);
+
+ res.zmmd = _mm512_loadu_pd(&src_vals[0].s32[2]);
+ for (i = 0; i < 16; i++) {
+ expected.s32[i] = p[i + 2];
+ }
+ check_equal_nd(&res, &expected, 16, "_mm512_loadu_pd", __LINE__);
+
+ res.zmmi = _mm512_loadu_si512(&src_vals[0].s32[3]);
+ for (i = 0; i < 16; i++) {
+ expected.s32[i] = p[i + 3];
+ }
+ check_equal_nd(&res, &expected, 16, "_mm512_loadu_si512", __LINE__);
+
+ /* Now the write-masked versions. */
+
+ res = all_ones;
+ expected = all_ones;
+ res.zmm = _mm512_mask_loadu_ps(res.zmm, k16, &src_vals[0].s32[5]);
+ for (i = 0; i < 16; i++) {
+ if ((1 << i) & k16) {
+ expected.s32[i] = p[i + 5];
+ }
+ }
+ check_equal_nd(&res, &expected, 16, "_mm512_mask_loadu_ps", __LINE__);
+
+ k64 += vol0;
+ res = all_ones;
+ expected = all_ones;
+ res.zmmi = _mm512_mask_loadu_epi8(res.zmmi, k64, &src_vals[0].s8[7]);
+ for (i = 0; i < 64; i++) {
+ if (((__mmask64)1 << i) & k64) {
+ expected.s8[i] = p8[i + 7];
+ }
+ }
+ check_equal_nd(&res, &expected, 16, "_mm512_mask_loadu_epi8", __LINE__);
+
+ k64 += vol0;
+ res = all_ones;
+ expected.zmmi = _mm512_setzero_epi32();
+ res.zmmi = _mm512_maskz_loadu_epi8(k64, &src_vals[0].s8[9]);
+ for (i = 0; i < 64; i++) {
+ if (((__mmask64)1 << i) & k64) {
+ expected.s8[i] = p8[i + 9];
+ }
+ }
+ check_equal_nd(&res, &expected, 16, "_mm512_maskz_loadu_epi8", __LINE__);
+
+ k32 += vol0;
+ res = all_ones;
+ expected = all_ones;
+ res.zmmi = _mm512_mask_loadu_epi16(res.zmmi, k32, &src_vals[0].s16[5]);
+ for (i = 0; i < 32; i++) {
+ if ((1 << i) & k32) {
+ expected.s16[i] = p16[i + 5];
+ }
+ }
+ check_equal_nd(&res, &expected, 16, "_mm512_mask_loadu_epi16", __LINE__);
+
+ k32 += vol0;
+ res = all_ones;
+ expected.zmmi = _mm512_setzero_epi32();
+ res.zmmi = _mm512_maskz_loadu_epi16(k32, &src_vals[0].s16[3]);
+ for (i = 0; i < 32; i++) {
+ if ((1 << i) & k32) {
+ expected.s16[i] = p16[i + 3];
+ }
+ }
+ check_equal_nd(&res, &expected, 16, "_mm512_maskz_loadu_epi16", __LINE__);
+
+ k16 = 0xabcd + vol0;
+ res = all_ones;
+ expected = all_ones;
+ res.zmmi = _mm512_mask_loadu_epi32(res.zmmi, k16, &src_vals[0].s32[7]);
+ for (i = 0; i < 16; i++) {
+ if ((1 << i) & k16) {
+ expected.s32[i] = p[i + 7];
+ }
+ }
+ check_equal_nd(&res, &expected, 16, "_mm512_mask_loadu_epi32", __LINE__);
+
+ res = all_ones;
+ expected = all_ones;
+ res.zmmd = _mm512_mask_loadu_pd(res.zmmd, k8, &src_vals[0].s64[2]);
+ for (i = 0; i < 8; i++) {
+ if ((1 << i) & k8) {
+ expected.s64[i] = p64[i + 2];
+ }
+ }
+ check_equal_nd(&res, &expected, 16, "_mm512_mask_loadu_pd", __LINE__);
+
+ k8 = 0x79 + vol0;
+ res = all_ones;
+ expected = all_ones;
+ res.zmmi = _mm512_mask_loadu_epi64(res.zmmi, k8, &src_vals[0].s64[3]);
+ for (i = 0; i < 8; i++) {
+ if ((1 << i) & k8) {
+ expected.s64[i] = p64[i + 3];
+ }
+ }
+ check_equal_nd(&res, &expected, 16, "_mm512_mask_loadu_epi64", __LINE__);
+}
+
+void NOINLINE do_storeu() {
+ V512 src;
+ V512 expected;
+ volatile int i;
+ static V512 dst_vals[2];
+ __mmask64 k64 = 0xabcdffffffffeebd;
+ __mmask32 k32 = 0xfefebdbd;
+ __mmask16 k16 = 0x79ab;
+ __mmask8 k8 = 0xea;
+
+ src.zmmi = src_vals[0].zmmi;
+
+ dst_vals[0].zmm = _mm512_setzero_ps();
+ dst_vals[1].zmm = _mm512_setzero_ps();
+ _mm512_storeu_si512(&dst_vals[0].s32[1], src.zmmi);
+ check_equal_nd(&dst_vals[0].s32[1], &src_vals, 16, "_mm512_storeu_si512",
+ __LINE__);
+
+ dst_vals[0].zmm = _mm512_setzero_ps();
+ dst_vals[1].zmm = _mm512_setzero_ps();
+ _mm512_storeu_ps(&dst_vals[0].s32[2], src.zmm);
+  check_equal_nd(&dst_vals[0].s32[2], &src_vals, 16, "_mm512_storeu_ps",
+ __LINE__);
+
+ dst_vals[0].zmm = _mm512_setzero_ps();
+ dst_vals[1].zmm = _mm512_setzero_ps();
+ _mm512_storeu_pd(&dst_vals[0].s32[4], src.zmmd);
+ check_equal_nd(&dst_vals[0].s32[4], &src_vals, 16, "_mm512_storeu_pd",
+ __LINE__);
+
+ /* Now the write-masked versions. */
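+  /*
+   * A masked store writes only the elements whose mask bit is set; all
+   * other destination memory is left untouched, which is why the
+   * all-ones background pattern must survive in the expected value.
+   */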
+
+ dst_vals[0] = all_ones;
+ dst_vals[1] = all_ones;
+ _mm512_mask_storeu_epi8(&dst_vals[0].s8[3], k64, src.zmmi);
+ expected = all_ones;
+ for (i = 0; i < 64; i++) {
+ if (((__mmask64)1 << i) & k64) {
+ expected.s8[i] = src.s8[i];
+ }
+ }
+ check_equal_nd(&dst_vals[0].s8[3], &expected, 16, "_mm512_mask_storeu_epi8",
+ __LINE__);
+
+ dst_vals[0] = all_ones;
+ dst_vals[1] = all_ones;
+ _mm512_mask_storeu_epi16(&dst_vals[0].s16[3], k32, src.zmmi);
+ expected = all_ones;
+ for (i = 0; i < 32; i++) {
+ if (((__mmask32)1 << i) & k32) {
+ expected.s16[i] = src.s16[i];
+ }
+ }
+ check_equal_nd(&dst_vals[0].s16[3], &expected, 16, "_mm512_mask_storeu_epi16",
+ __LINE__);
+
+ dst_vals[0] = all_ones;
+ dst_vals[1] = all_ones;
+ _mm512_mask_storeu_epi32(&dst_vals[0].s32[1], k16, src.zmmi);
+ expected = all_ones;
+ for (i = 0; i < 16; i++) {
+ if ((1 << i) & k16) {
+ expected.s32[i] = src.s32[i];
+ }
+ }
+ check_equal_nd(&dst_vals[0].s32[1], &expected, 16, "_mm512_mask_storeu_epi32",
+ __LINE__);
+
+ k16 = 0xdcba + vol0;
+ dst_vals[0] = all_ones;
+ dst_vals[1] = all_ones;
+ _mm512_mask_storeu_ps(&dst_vals[0].s32[3], k16, src.zmm);
+ expected = all_ones;
+ for (i = 0; i < 16; i++) {
+ if ((1 << i) & k16) {
+ expected.s32[i] = src.s32[i];
+ }
+ }
+ check_equal_nd(&dst_vals[0].s32[3], &expected, 16, "_mm512_mask_storeu_ps",
+ __LINE__);
+
+ k8 = 0xbc;
+ dst_vals[0] = all_ones;
+ dst_vals[1] = all_ones;
+ _mm512_mask_storeu_pd(&dst_vals[0].s64[3], k8, src.zmmd);
+ expected = all_ones;
+ for (i = 0; i < 8; i++) {
+ if ((1 << i) & k8) {
+ expected.s64[i] = src.s64[i];
+ }
+ }
+ check_equal_nd(&dst_vals[0].s64[3], &expected, 16, "_mm512_mask_storeu_pd",
+ __LINE__);
+
+ k8 = 0xcb + vol0;
+ dst_vals[0] = all_ones;
+ dst_vals[1] = all_ones;
+ _mm512_mask_storeu_epi64(&dst_vals[0].s64[1], k8, src.zmmi);
+ expected = all_ones;
+ for (i = 0; i < 8; i++) {
+ if ((1 << i) & k8) {
+ expected.s64[i] = src.s64[i];
+ }
+ }
+ check_equal_nd(&dst_vals[0].s64[1], &expected, 16, "_mm512_mask_storeu_epi64",
+ __LINE__);
+}
+
+int main(int argc, char *argv[]) {
+ init();
+
+ do_loadu();
+ do_storeu();
+
+ if (n_errs != 0) {
+ printf("FAILED\n");
+ return 1;
+ }
+
+ printf("PASSED\n");
+ return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512BW/load_store.reference_output b/SingleSource/UnitTests/Vector/AVX512BW/load_store.reference_output
new file mode 100644
index 00000000..bfae62d0
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512BW/load_store.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512BWVL/mask_set_bw.c b/SingleSource/UnitTests/Vector/AVX512BWVL/mask_set_bw.c
new file mode 100644
index 00000000..b6ea9d05
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512BWVL/mask_set_bw.c
@@ -0,0 +1,234 @@
+/*
+ * Exercise intrinsics for instructions that set a mask register from
+ * values in vector registers, and that set vector register values from
+ * values in a mask register.
+ */
+
+#include "m512_test_util.h"
+
+__int64 calc_expected_mask_val(const char *valp, int el_size, int length) {
+ __int64 rval = 0;
+ int i;
+
+ for (i = 0; i < length; i++) {
+ if ((valp[el_size * i + (el_size - 1)] & 0x80) != 0) {
+ rval |= (1LL << i);
+ }
+ }
+
+ return rval;
+}
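+
+/*
+ * Example: with el_size == 2 and bytes {0x00,0x80, 0x00,0x00, 0xff,0xff},
+ * elements 0 and 2 have the sign bit of their top byte set, so the
+ * returned mask is 0b101.
+ */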
+
+char *calc_expected_vec_val(__mmask64 mask_val, int mask_size, int el_size,
+ char *buf) {
+ int i, j;
+
+ for (i = 0; i < mask_size * el_size; buf[i++] = 0)
+ ;
+
+ for (i = 0; i < mask_size; i++) {
+ if ((mask_val & (1LL << i)) != 0) {
+ for (j = 0; j < el_size; j++) {
+ buf[i * el_size + j] = 0xff;
+ }
+ }
+ }
+
+ return buf;
+}
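+
+/*
+ * Example: calc_expected_vec_val(0x5, 4, 2, buf) yields the four 16-bit
+ * elements {0xffff, 0x0000, 0xffff, 0x0000}: each set mask bit expands
+ * to an all-ones element, matching the vpmovm2* semantics.
+ */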
+
+NOINLINE void check_mask16(__mmask16 res_mask, __mmask16 exp_mask,
+ const char *fname, const char *input) {
+ int i;
+
+ if (res_mask != exp_mask) {
+ printf("%s: 0x%x != 0x%x, input = ", fname, res_mask, exp_mask);
+ for (i = 0; i < 16; i++) {
+ printf("%02x ", input[i] & 0xff);
+ }
+ printf("\n");
+ n_errs++;
+ }
+}
+
+NOINLINE void check_xmm_arr(const __m128i xvar, char *buf, const char *fname,
+ __mmask64 input) {
+ int i;
+ char *p = (char *)&xvar;
+
+ if (memcmp((void *)p, (void *)buf, 16) != 0) {
+ printf("%s: 0x", fname);
+ for (i = 0; i < 16; i++) {
+ printf(" %02x", p[i] & 0xff);
+ }
+ printf(" != 0x");
+ for (i = 0; i < 16; i++) {
+ printf(" %02x", buf[i] & 0xff);
+ }
+ printf(", input = 0x%04x\n", (int)(input)&0xffff);
+ n_errs++;
+ }
+}
+
+NOINLINE void test_xmm(int shift, int mulp) {
+ ALIGNTO(16) char buf[16];
+ int i;
+ __m128i xvar;
+
+ for (i = 0; i < 16; i++) {
+ buf[i] = (i << shift) * mulp;
+ }
+
+ memcpy(&xvar, buf, 16);
+
+ check_mask16(_mm_movepi8_mask(xvar), calc_expected_mask_val(buf, 1, 16),
+ "_mm_movepi8_mask", buf);
+ check_mask16(_mm_movepi16_mask(xvar), calc_expected_mask_val(buf, 2, 8),
+ "_mm_movepi16_mask", buf);
+
+ check_xmm_arr(_mm_movm_epi8((__mmask16)shift * mulp),
+ calc_expected_vec_val(shift * mulp, 16, 1, buf),
+ "_mm_movm_epi8", (__mmask16)shift * mulp);
+ check_xmm_arr(_mm_movm_epi16((__mmask16)shift * mulp),
+ calc_expected_vec_val(shift * mulp, 8, 2, buf),
+ "_mm_movm_epi16", (__mmask16)shift * mulp);
+}
+
+NOINLINE void check_mask32(__mmask32 res_mask, __mmask32 exp_mask,
+ const char *fname, const char *input) {
+ int i;
+
+ if (res_mask != exp_mask) {
+ printf("%s: 0x%x != 0x%x, input = ", fname, res_mask, exp_mask);
+ for (i = 0; i < 32; i++) {
+ printf("%02x ", input[i] & 0xff);
+ }
+ printf("\n");
+ n_errs++;
+ }
+}
+
+NOINLINE void check_ymm_arr(const __m256i yvar, char *buf, const char *fname,
+ __mmask64 input) {
+ int i;
+ char *p = (char *)&yvar;
+
+ if (memcmp((void *)p, (void *)buf, 32) != 0) {
+ printf("%s: 0x", fname);
+ for (i = 0; i < 32; i++) {
+ printf(" %02x", p[i] & 0xff);
+ }
+ printf(" != 0x");
+ for (i = 0; i < 32; i++) {
+ printf(" %02x", buf[i] & 0xff);
+ }
+ printf(", input = 0x%04x\n", (int)(input));
+ n_errs++;
+ }
+}
+
+NOINLINE void test_ymm(int shift, int mulp) {
+ ALIGNTO(32) char buf[32];
+ int i;
+ __m256i yvar;
+
+ for (i = 0; i < 32; i++) {
+ buf[i] = (i << shift) * mulp;
+ }
+
+ memcpy(&yvar, buf, 32);
+
+ check_mask32(_mm256_movepi8_mask(yvar), calc_expected_mask_val(buf, 1, 32),
+ "_mm256_movepi8_mask", buf);
+ check_mask32(_mm256_movepi16_mask(yvar), calc_expected_mask_val(buf, 2, 16),
+ "_mm256_movepi16_mask", buf);
+
+ check_ymm_arr(_mm256_movm_epi8((__mmask32)shift * mulp),
+ calc_expected_vec_val(shift * mulp, 32, 1, buf),
+ "_mm256_movm_epi8", (__mmask32)shift * mulp);
+ check_ymm_arr(_mm256_movm_epi16((__mmask32)shift * mulp),
+ calc_expected_vec_val(shift * mulp, 16, 2, buf),
+ "_mm256_movm_epi16", (__mmask32)shift * mulp);
+}
+
+NOINLINE void check_mask64(__mmask64 res_mask, __mmask64 exp_mask,
+ const char *fname, const char *input) {
+ int i;
+
+ if (res_mask != exp_mask) {
+ printf("%s: 0x%llx != 0x%llx, input = ", fname, res_mask, exp_mask);
+ for (i = 0; i < 64; i++) {
+ printf("%02x ", input[i] & 0xff);
+ }
+ printf("\n");
+ n_errs++;
+ }
+}
+
+NOINLINE void check_zmm_arr(const __m512i zvar, char *buf, const char *fname,
+ __mmask64 input) {
+ int i;
+ char *p = (char *)&zvar;
+
+ if (memcmp((void *)p, (void *)buf, 64) != 0) {
+ printf("%s: 0x", fname);
+ for (i = 0; i < 64; i++) {
+ printf(" %02x", p[i] & 0xff);
+ }
+ printf(" != 0x");
+ for (i = 0; i < 64; i++) {
+ printf(" %02x", buf[i] & 0xff);
+ }
+ printf(", input = 0x%08llx\n", input);
+ n_errs++;
+ }
+}
+
+NOINLINE void test_zmm(int shift, int mulp) {
+ ALIGNTO(64) char buf[64];
+ int i;
+ __m512i zvar;
+
+ for (i = 0; i < 64; i++) {
+ buf[i] = (i << shift) * mulp;
+ }
+
+ memcpy(&zvar, buf, 64);
+
+ check_mask64(_mm512_movepi8_mask(zvar), calc_expected_mask_val(buf, 1, 64),
+ "_mm512_movepi8_mask", buf);
+ check_mask64(_mm512_movepi16_mask(zvar), calc_expected_mask_val(buf, 2, 32),
+ "_mm512_movepi16_mask", buf);
+
+ check_zmm_arr(_mm512_movm_epi8((__mmask64)shift * mulp),
+ calc_expected_vec_val(shift * mulp, 64, 1, buf),
+ "_mm512_movm_epi8", (__mmask64)shift * mulp);
+ check_zmm_arr(_mm512_movm_epi16((__mmask64)shift * mulp),
+ calc_expected_vec_val(shift * mulp, 32, 2, buf),
+ "_mm512_movm_epi16", (__mmask64)shift * mulp);
+}
+
+NOINLINE void test_all() {
+ int shift, mulp;
+
+ for (mulp = -1000; mulp < 1000; mulp += 10) {
+ for (shift = 0; shift < 64; shift++) {
+ test_xmm(shift, mulp);
+ test_ymm(shift, mulp);
+ test_zmm(shift, mulp);
+ }
+ }
+}
+
+int main(void) {
+ test_all();
+
+ if (n_errs != 0) {
+ printf("FAILED, n_errs = %d\n", n_errs);
+ return 1;
+ }
+
+ printf("PASSED\n");
+ return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512BWVL/mask_set_bw.reference_output b/SingleSource/UnitTests/Vector/AVX512BWVL/mask_set_bw.reference_output
new file mode 100644
index 00000000..bfae62d0
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512BWVL/mask_set_bw.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512BWVL/mov_xyz_int.c b/SingleSource/UnitTests/Vector/AVX512BWVL/mov_xyz_int.c
new file mode 100644
index 00000000..0b29243c
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512BWVL/mov_xyz_int.c
@@ -0,0 +1,208 @@
+
+/*
+ * Test load, copy and store intrinsics related to integer move instructions.
+ */
+
+#include "m512_test_util.h"
+
+V512 i8_src1;
+V512 i8_src2;
+V512 i16_src1;
+V512 i16_src2;
+
+void NOINLINE init() {
+ volatile int i;
+
+ for (i = 0; i < 64; i++) {
+ i8_src1.s8[i] = i;
+ i8_src2.s8[i] = (i & 1) ? i : -i;
+ }
+
+ for (i = 0; i < 32; i++) {
+ i16_src1.s16[i] = i;
+ i16_src2.s16[i] = (i & 1) ? i : -i;
+ }
+}
+
+/*
+ * Use "soft update" between tests to make compiler think src was updated.
+ * Prevents PRE'ing a load of src, thus allowing ciscization.
+ * Also prevents PRE'ing intrinsic operations, ensuring we
+ * execute the intended instructions.
+ */
+volatile int vol0 = 0;
+#define soft_v512_update(var) (var).xmmi[vol0] = (var).xmmi[vol0]
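+
+/*
+ * Example: after soft_v512_update(i8_src2), the compiler must assume
+ * i8_src2 may have changed, since vol0 is volatile and the
+ * self-assignment through it cannot be optimized away; the next
+ * intrinsic use therefore reloads the source.
+ */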
+
+#define BLANK
+
+#define GEN_MASK_I8_LOAD(oper) GEN_MASK_I8(oper, &)
+#define GEN_MASK_I8_COPY(oper) GEN_MASK_I8(oper, BLANK)
+
+#define GEN_MASK_I8(oper, addr_of) \
+ void NOINLINE do_##oper() { \
+ V512 xmm_res, ymm_res, zmm_res; \
+ __mmask64 k64 = 0xabcdeffe97febdca; \
+ __mmask32 k32 = (__mmask32)k64; \
+ __mmask16 k16 = (__mmask16)k64; \
+ \
+ /* Masked. */ \
+ \
+ zmm_res.zmmi = _mm512_setzero_epi32(); \
+ ymm_res = zmm_res; \
+ xmm_res = zmm_res; \
+ \
+ soft_v512_update(i8_src2); \
+ zmm_res.zmmi = \
+ _mm512_mask_##oper(i8_src1.zmmi, k64, addr_of i8_src2.zmmi); \
+ soft_v512_update(i8_src2); \
+ ymm_res.ymmi[0] = \
+ _mm256_mask_##oper(i8_src1.ymmi[0], k32, addr_of i8_src2.ymmi[0]); \
+ soft_v512_update(i8_src2); \
+ xmm_res.xmmi[0] = \
+ _mm_mask_##oper(i8_src1.xmmi[0], k16, addr_of i8_src2.xmmi[0]); \
+ \
+ check_equal_nd(&ymm_res, &zmm_res, 8, "_mm256_mask_" #oper, __LINE__); \
+ check_equal_nd(&xmm_res, &zmm_res, 4, "_mm_mask_" #oper, __LINE__); \
+ \
+ /* Zero-masked. */ \
+ \
+    zmm_res.zmmi = _mm512_set1_epi32(1); \
+ ymm_res = zmm_res; \
+ xmm_res = zmm_res; \
+ \
+ soft_v512_update(i8_src1); \
+ zmm_res.zmmi = _mm512_maskz_##oper(k64, addr_of i8_src1.zmmi); \
+ soft_v512_update(i8_src1); \
+ ymm_res.ymmi[0] = _mm256_maskz_##oper(k32, addr_of i8_src1.ymmi[0]); \
+ soft_v512_update(i8_src1); \
+ xmm_res.xmmi[0] = _mm_maskz_##oper(k16, addr_of i8_src1.xmmi[0]); \
+ \
+ check_equal_nd(&ymm_res, &zmm_res, 8, "_mm256_maskz_" #oper, __LINE__); \
+ check_equal_nd(&xmm_res, &zmm_res, 4, "_mm_maskz_" #oper, __LINE__); \
+ }
+
+#define GEN_MASK_I8_STORE(oper) \
+ void NOINLINE do_##oper() { \
+ V512 xmm_res, ymm_res, zmm_res; \
+ __mmask64 k64 = 0xabcdeffe97febdca; \
+ __mmask32 k32 = (__mmask32)k64; \
+ __mmask16 k16 = (__mmask16)k64; \
+ \
+ /* Masked. */ \
+ \
+ zmm_res = i16_src1; \
+ ymm_res = zmm_res; \
+ xmm_res = zmm_res; \
+ \
+ soft_v512_update(i8_src2); \
+ _mm512_mask_##oper(&zmm_res.zmmi, k64, i8_src2.zmmi); \
+ soft_v512_update(i8_src2); \
+ soft_v512_update(ymm_res); \
+ _mm256_mask_##oper(&ymm_res.ymmi[0], k32, i8_src2.ymmi[0]); \
+ soft_v512_update(i8_src2); \
+ soft_v512_update(xmm_res); \
+ _mm_mask_##oper(&xmm_res.xmmi[0], k16, i8_src2.xmmi[0]); \
+ \
+ check_equal_nd(&ymm_res, &zmm_res, 8, "_mm256_mask_" #oper, __LINE__); \
+ check_equal_nd(&xmm_res, &zmm_res, 4, "_mm_mask_" #oper, __LINE__); \
+ }
+
+#define GEN_MASK_I16_LOAD(oper) GEN_MASK_I16(oper, &)
+#define GEN_MASK_I16_COPY(oper) GEN_MASK_I16(oper, BLANK)
+
+#define GEN_MASK_I16(oper, addr_of) \
+ void NOINLINE do_##oper() { \
+ V512 xmm_res, ymm_res, zmm_res; \
+ __mmask32 k32 = 0xcfe97dba; \
+ __mmask16 k16 = (__mmask16)k32; \
+ __mmask8 k8 = (__mmask8)k32; \
+ \
+ /* Masked. */ \
+ \
+ zmm_res.zmmi = _mm512_setzero_epi32(); \
+ ymm_res = zmm_res; \
+ xmm_res = zmm_res; \
+ \
+ soft_v512_update(i16_src2); \
+ zmm_res.zmmi = \
+ _mm512_mask_##oper(i16_src1.zmmi, k32, addr_of i16_src2.zmmi); \
+ soft_v512_update(i16_src2); \
+ ymm_res.ymmi[0] = \
+ _mm256_mask_##oper(i16_src1.ymmi[0], k16, addr_of i16_src2.ymmi[0]); \
+ soft_v512_update(i16_src2); \
+ xmm_res.xmmi[0] = \
+ _mm_mask_##oper(i16_src1.xmmi[0], k8, addr_of i16_src2.xmmi[0]); \
+ \
+ check_equal_nd(&ymm_res, &zmm_res, 8, "_mm256_mask_" #oper, __LINE__); \
+ check_equal_nd(&xmm_res, &zmm_res, 4, "_mm_mask_" #oper, __LINE__); \
+ \
+ /* Zero-masked. */ \
+ \
+    zmm_res.zmmi = _mm512_set1_epi32(1); \
+ ymm_res = zmm_res; \
+ xmm_res = zmm_res; \
+ \
+ soft_v512_update(i16_src1); \
+ zmm_res.zmmi = _mm512_maskz_##oper(k32, addr_of i16_src1.zmmi); \
+ soft_v512_update(i16_src1); \
+ ymm_res.ymmi[0] = _mm256_maskz_##oper(k16, addr_of i16_src1.ymmi[0]); \
+ soft_v512_update(i16_src1); \
+ xmm_res.xmmi[0] = _mm_maskz_##oper(k8, addr_of i16_src1.xmmi[0]); \
+ \
+ check_equal_nd(&ymm_res, &zmm_res, 8, "_mm256_maskz_" #oper, __LINE__); \
+ check_equal_nd(&xmm_res, &zmm_res, 4, "_mm_maskz_" #oper, __LINE__); \
+ }
+
+#define GEN_MASK_I16_STORE(oper) \
+ void NOINLINE do_##oper() { \
+ V512 xmm_res, ymm_res, zmm_res; \
+ __mmask32 k32 = 0xcfe97dba; \
+ __mmask16 k16 = (__mmask16)k32; \
+ __mmask8 k8 = (__mmask8)k32; \
+ \
+ /* Masked. */ \
+ \
+ zmm_res.zmmi = _mm512_setzero_epi32(); \
+ ymm_res = zmm_res; \
+ xmm_res = zmm_res; \
+ \
+ soft_v512_update(i16_src2); \
+ _mm512_mask_##oper(&zmm_res.zmmi, k32, i16_src2.zmmi); \
+ soft_v512_update(i16_src2); \
+ soft_v512_update(ymm_res); \
+ _mm256_mask_##oper(&ymm_res.ymmi[0], k16, i16_src2.ymmi[0]); \
+ soft_v512_update(i16_src2); \
+ soft_v512_update(xmm_res); \
+ _mm_mask_##oper(&xmm_res.xmmi[0], k8, i16_src2.xmmi[0]); \
+ \
+ check_equal_nd(&ymm_res, &zmm_res, 8, "_mm256_mask_" #oper, __LINE__); \
+ check_equal_nd(&xmm_res, &zmm_res, 4, "_mm_mask_" #oper, __LINE__); \
+ }
+
+GEN_MASK_I8_LOAD(loadu_epi8)
+GEN_MASK_I8_COPY(mov_epi8)
+GEN_MASK_I8_STORE(storeu_epi8)
+
+GEN_MASK_I16_LOAD(loadu_epi16)
+GEN_MASK_I16_COPY(mov_epi16)
+GEN_MASK_I16_STORE(storeu_epi16)
+
+int main() {
+ init();
+
+ do_loadu_epi8();
+ do_mov_epi8();
+ do_storeu_epi8();
+
+ do_loadu_epi16();
+ do_mov_epi16();
+ do_storeu_epi16();
+
+ if (n_errs != 0) {
+ printf("FAILED\n");
+ return 1;
+ }
+
+ printf("PASSED\n");
+ return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512BWVL/mov_xyz_int.reference_output b/SingleSource/UnitTests/Vector/AVX512BWVL/mov_xyz_int.reference_output
new file mode 100644
index 00000000..bfae62d0
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512BWVL/mov_xyz_int.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512BWVL/sets.c b/SingleSource/UnitTests/Vector/AVX512BWVL/sets.c
new file mode 100644
index 00000000..2e42b7c7
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512BWVL/sets.c
@@ -0,0 +1,316 @@
+/*
+ * Test the "set" intrinsics.
+ *
+ * This test was created to check the correctness
+ * of the following intrinsics:
+ * _mm_mask_set1_epi*
+ * _mm_maskz_set1_epi*
+ * _mm256_mask_set1_epi*
+ * _mm256_maskz_set1_epi*
+ * _mm512_mask_set1_epi*
+ * _mm512_maskz_set1_epi*
+ * _mm512_set1_epi*
+ */
+
+#include "m512_test_util.h"
+
+volatile int vol0 = 0;
+
+/* Some scalars that can be ciscized. */
+
+unsigned char char6 = 6;
+unsigned short short7 = 7;
+int int11 = 11;
+__int64 int64_13 = 13;
+
+void NOINLINE invalidate_scalars() {
+ /* Make compiler think these variables could have an arbitrary value. */
+ char6 += vol0;
+ short7 += vol0;
+ int11 += vol0;
+ int64_13 += vol0;
+}
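+
+/*
+ * Example: after invalidate_scalars(), a use of char6 cannot be constant
+ * folded to 6, so _mm512_set1_epi8(char6) has to be emitted as a real
+ * broadcast from a variable rather than an immediate splat.
+ */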
+
+void NOINLINE do_set1_epi8() {
+ V512 res, xres, yres;
+ V512 expected;
+ __mmask64 k = 0xffeebb97abcdffe9;
+ __mmask32 k32 = (__mmask32)k;
+ __mmask16 k16 = (__mmask16)k32;
+ int i;
+
+ res.zmmi = _mm512_set1_epi8(9);
+ expected.zmmi = _mm512_set1_epi32(0x09090909);
+ check_equal_nd(&res, &expected, 16, "_mm512_set1_epi8", __LINE__);
+
+ invalidate_scalars();
+
+ res.zmmi = _mm512_set1_epi8(char6);
+ expected.zmmi = _mm512_set1_epi32(0x06060606);
+ check_equal_nd(&res, &expected, 16, "_mm512_set1_epi8 unknown", __LINE__);
+
+ invalidate_scalars();
+
+ /* Masked */
+ res.zmmi = _mm512_set1_epi32(-1);
+
+ xres.xmmi[0] = _mm_mask_set1_epi8(res.xmmi[0], k16, 14);
+ yres.ymmi[0] = _mm256_mask_set1_epi8(res.ymmi[0], k32, 14);
+ res.zmmi = _mm512_mask_set1_epi8(res.zmmi, k, 14);
+
+ expected.zmmi = _mm512_set1_epi32(0x0e0e0e0e);
+ for (i = 0; i < 64; i++) {
+ if ((k & ((__mmask64)1 << i)) == 0) {
+ expected.s8[i] = -1;
+ }
+ }
+ check_equal_nd(&res, &expected, 16, "_mm512_mask_set1_epi8", __LINE__);
+ check_equal_nd(&yres, &expected, 8, "_mm256_mask_set1_epi8", __LINE__);
+ check_equal_nd(&xres, &expected, 4, "_mm_mask_set1_epi8", __LINE__);
+
+ invalidate_scalars();
+ /* Zero masked */
+
+ res.zmmi = _mm512_maskz_set1_epi8(k, 19);
+ xres.xmmi[0] = _mm_maskz_set1_epi8(k16, 19);
+ yres.ymmi[0] = _mm256_maskz_set1_epi8(k32, 19);
+
+ expected.zmmi = _mm512_set1_epi32(0x13131313);
+ for (i = 0; i < 64; i++) {
+ if ((k & ((__mmask64)1 << i)) == 0) {
+ expected.s8[i] = 0;
+ }
+ }
+ check_equal_nd(&res, &expected, 16, "_mm512_maskz_set1_epi8", __LINE__);
+ check_equal_nd(&yres, &expected, 8, "_mm256_maskz_set1_epi8", __LINE__);
+ check_equal_nd(&xres, &expected, 4, "_mm_maskz_set1_epi8", __LINE__);
+
+ invalidate_scalars();
+
+ res.zmmi = _mm512_maskz_set1_epi8(k, char6);
+ expected.zmmi = _mm512_set1_epi32(0x06060606);
+ for (i = 0; i < 64; i++) {
+ if ((k & ((__mmask64)1 << i)) == 0) {
+ expected.s8[i] = 0;
+ }
+ }
+ check_equal_nd(&res, &expected, 16, "_mm512_maskz_set1_epi8 unknown",
+ __LINE__);
+}
+
+void NOINLINE do_set1_epi16() {
+ V512 res, xres, yres;
+ V512 expected;
+ __mmask32 k = 0xabcdffe9;
+ __mmask16 k16 = (__mmask16)k;
+ __mmask8 k8 = (__mmask8)k16;
+ int i;
+
+ res.zmmi = _mm512_set1_epi16(9);
+ expected.zmmi = _mm512_set1_epi32((9 << 16) | 9);
+ check_equal_nd(&res, &expected, 16, "_mm512_set1_epi16", __LINE__);
+
+ invalidate_scalars();
+
+ res.zmmi = _mm512_set1_epi16(short7);
+ expected.zmmi = _mm512_set1_epi32((7 << 16) | 7);
+ check_equal_nd(&res, &expected, 16, "_mm512_set1_epi16 unknown", __LINE__);
+
+ invalidate_scalars();
+
+ res.zmmi = _mm512_set1_epi32(-1);
+
+ xres.xmmi[0] = _mm_mask_set1_epi16(res.xmmi[0], k8, 14);
+ yres.ymmi[0] = _mm256_mask_set1_epi16(res.ymmi[0], k16, 14);
+ res.zmmi = _mm512_mask_set1_epi16(res.zmmi, k, 14);
+
+ expected.zmmi = _mm512_set1_epi32((14 << 16) | 14);
+ for (i = 0; i < 32; i++) {
+ if ((k & (1 << i)) == 0) {
+ expected.s16[i] = -1;
+ }
+ }
+ check_equal_nd(&res, &expected, 16, "_mm512_mask_set1_epi16", __LINE__);
+ check_equal_nd(&yres, &expected, 8, "_mm256_mask_set1_epi16", __LINE__);
+ check_equal_nd(&xres, &expected, 4, "_mm_mask_set1_epi16", __LINE__);
+
+ invalidate_scalars();
+
+ res.zmmi = _mm512_maskz_set1_epi16(k, 19);
+ xres.xmmi[0] = _mm_maskz_set1_epi16(k8, 19);
+ yres.ymmi[0] = _mm256_maskz_set1_epi16(k16, 19);
+
+ expected.zmmi = _mm512_set1_epi32((19 << 16) | 19);
+ for (i = 0; i < 32; i++) {
+ if ((k & (1 << i)) == 0) {
+ expected.s16[i] = 0;
+ }
+ }
+ check_equal_nd(&res, &expected, 16, "_mm512_maskz_set1_epi16", __LINE__);
+ check_equal_nd(&yres, &expected, 8, "_mm256_maskz_set1_epi16", __LINE__);
+ check_equal_nd(&xres, &expected, 4, "_mm_maskz_set1_epi16", __LINE__);
+
+ invalidate_scalars();
+
+ res.zmmi = _mm512_maskz_set1_epi16(k, short7);
+ expected.zmmi = _mm512_set1_epi32((7 << 16) | 7);
+ for (i = 0; i < 32; i++) {
+ if ((k & (1 << i)) == 0) {
+ expected.s16[i] = 0;
+ }
+ }
+ check_equal_nd(&res, &expected, 16, "_mm512_maskz_set1_epi16 unknown",
+ __LINE__);
+}
+
+void NOINLINE do_set1_epi32() {
+ V512 res, xres, yres;
+ V512 expected;
+ __mmask16 k = 0xf7e6;
+ __mmask8 k8 = (__mmask8)k;
+ volatile int i;
+
+ res.zmmi = _mm512_set1_epi32(9);
+ for (i = 0; i < 16; i++) {
+ expected.s32[i] = 9;
+ }
+ check_equal_nd(&res, &expected, 16, "_mm512_set1_epi32", __LINE__);
+
+ invalidate_scalars();
+
+ res.zmmi = _mm512_set1_epi32(int11);
+ for (i = 0; i < 16; i++) {
+ expected.s32[i] = 11;
+ }
+ check_equal_nd(&res, &expected, 16, "_mm512_set1_epi32 unknown", __LINE__);
+
+ invalidate_scalars();
+
+ res.zmmi = _mm512_set1_epi32(-1);
+ xres.xmmi[0] = _mm_mask_set1_epi32(res.xmmi[0], k8, 14);
+ yres.ymmi[0] = _mm256_mask_set1_epi32(res.ymmi[0], k8, 14);
+ res.zmmi = _mm512_mask_set1_epi32(res.zmmi, k, 14);
+
+ for (i = 0; i < 16; i++) {
+ if ((k & (1 << i)) == 0) {
+ expected.s32[i] = -1;
+ } else {
+ expected.s32[i] = 14;
+ }
+ }
+ check_equal_nd(&res, &expected, 16, "_mm512_mask_set1_epi32", __LINE__);
+ check_equal_nd(&yres, &expected, 8, "_mm256_mask_set1_epi32", __LINE__);
+ check_equal_nd(&xres, &expected, 4, "_mm_mask_set1_epi32", __LINE__);
+
+ invalidate_scalars();
+
+ res.zmmi = _mm512_maskz_set1_epi32(k, 19);
+ xres.xmmi[0] = _mm_maskz_set1_epi32(k8, 19);
+ yres.ymmi[0] = _mm256_maskz_set1_epi32(k8, 19);
+
+ for (i = 0; i < 16; i++) {
+ if ((k & (1 << i)) == 0) {
+ expected.s32[i] = 0;
+ } else {
+ expected.s32[i] = 19;
+ }
+ }
+ check_equal_nd(&res, &expected, 16, "_mm512_maskz_set1_epi32", __LINE__);
+ check_equal_nd(&yres, &expected, 8, "_mm256_maskz_set1_epi32", __LINE__);
+ check_equal_nd(&xres, &expected, 4, "_mm_maskz_set1_epi32", __LINE__);
+
+ invalidate_scalars();
+
+ res.zmmi = _mm512_maskz_set1_epi32(k, int11);
+ for (i = 0; i < 16; i++) {
+ if ((k & (1 << i)) == 0) {
+ expected.s32[i] = 0;
+ } else {
+ expected.s32[i] = 11;
+ }
+ }
+ check_equal_nd(&res, &expected, 16, "_mm512_maskz_set1_epi32 unknown",
+ __LINE__);
+}
+
+void NOINLINE do_set1_epi64() {
+ V512 res, xres, yres;
+ V512 expected;
+ __mmask8 k = 0xe7;
+ volatile int i;
+
+ res.zmmi = _mm512_set1_epi64(9);
+ for (i = 0; i < 8; i++) {
+ expected.s64[i] = 9;
+ }
+ check_equal_nd(&res, &expected, 16, "_mm512_set1_epi64", __LINE__);
+
+ invalidate_scalars();
+
+ res.zmmi = _mm512_set1_epi64(int64_13);
+ for (i = 0; i < 8; i++) {
+ expected.s64[i] = 13;
+ }
+ check_equal_nd(&res, &expected, 16, "_mm512_set1_epi64 unknown", __LINE__);
+
+ invalidate_scalars();
+
+ res.zmmi = _mm512_set1_epi64(-1);
+ xres.xmmi[0] = _mm_mask_set1_epi64(res.xmmi[0], k, 14);
+ yres.ymmi[0] = _mm256_mask_set1_epi64(res.ymmi[0], k, 14);
+ res.zmmi = _mm512_mask_set1_epi64(res.zmmi, k, 14);
+ for (i = 0; i < 8; i++) {
+ if ((k & (1 << i)) == 0) {
+ expected.s64[i] = -1;
+ } else {
+ expected.s64[i] = 14;
+ }
+ }
+ check_equal_nd(&res, &expected, 16, "_mm512_mask_set1_epi64", __LINE__);
+ check_equal_nd(&yres, &expected, 8, "_mm256_mask_set1_epi64", __LINE__);
+ check_equal_nd(&xres, &expected, 4, "_mm_mask_set1_epi64", __LINE__);
+
+ invalidate_scalars();
+
+ res.zmmi = _mm512_maskz_set1_epi64(k, 19);
+ xres.xmmi[0] = _mm_maskz_set1_epi64(k, 19);
+ yres.ymmi[0] = _mm256_maskz_set1_epi64(k, 19);
+ for (i = 0; i < 8; i++) {
+ if ((k & (1 << i)) == 0) {
+ expected.s64[i] = 0;
+ } else {
+ expected.s64[i] = 19;
+ }
+ }
+ check_equal_nd(&res, &expected, 16, "_mm512_maskz_set1_epi64", __LINE__);
+ check_equal_nd(&yres, &expected, 8, "_mm256_maskz_set1_epi64", __LINE__);
+ check_equal_nd(&xres, &expected, 4, "_mm_maskz_set1_epi64", __LINE__);
+
+ invalidate_scalars();
+
+ res.zmmi = _mm512_maskz_set1_epi64(k, int64_13);
+ for (i = 0; i < 8; i++) {
+ if ((k & (1 << i)) == 0) {
+ expected.s64[i] = 0;
+ } else {
+ expected.s64[i] = 13;
+ }
+ }
+ check_equal_nd(&res, &expected, 16, "_mm512_maskz_set1_epi64 unknown",
+ __LINE__);
+}
+
+int main(int argc, char *argv[]) {
+ do_set1_epi8();
+ do_set1_epi16();
+ do_set1_epi32();
+ do_set1_epi64();
+
+ if (n_errs) {
+ printf("FAILED\n");
+ return 1;
+ }
+
+ printf("PASSED\n");
+ return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512BWVL/sets.reference_output b/SingleSource/UnitTests/Vector/AVX512BWVL/sets.reference_output
new file mode 100644
index 00000000..bfae62d0
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512BWVL/sets.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512BWVL/vpmovdown_xmm.c b/SingleSource/UnitTests/Vector/AVX512BWVL/vpmovdown_xmm.c
new file mode 100644
index 00000000..68954bc3
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512BWVL/vpmovdown_xmm.c
@@ -0,0 +1,1253 @@
+/*
+ * Test intrinsics related to integer down-converting instructions
+ * like vpmovdb, where the source values are in an __m128i value.
+ *
+ * This test was created to check the correctness
+ * of the following AVX512 intrinsics:
+ * _mm_cvtepi*
+ * _mm_cvtsepi*
+ * _mm_cvtusepi*
+ * _mm_mask_cvtepi*
+ * _mm_mask_cvtsepi*
+ * _mm_mask_cvtusepi*
+ * _mm_maskz_cvtepi*
+ * _mm_maskz_cvtsepi*
+ * _mm_maskz_cvtusepi*
+ */
+
+#include "m512_test_util.h"
+#include <limits.h>
+
+volatile int vol0 = 0;
+
+#define soft_src_update(var) var.xmmi[vol0] = var.xmmi[vol0]
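+
+/*
+ * Down-conversion reminder: the plain cvt forms truncate (the 32-bit
+ * value 300, 0x12c, becomes the byte 0x2c), while the cvtsepi/cvtusepi
+ * forms saturate (300 becomes 127 via sat32_8 and 255 via usat32_8,
+ * defined below).
+ */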
+
+V512 i8;
+V512 i16;
+V512 i16_mix;
+V512 i16_big;
+V512 i32;
+V512 i32_mix;
+V512 i32_big;
+V512 i64;
+V512 i64_mix;
+
+void NOINLINE init() {
+ volatile int i;
+
+ for (i = 0; i < 64; i++) {
+ i8.s8[i] = i;
+ }
+
+ for (i = 0; i < 32; i++) {
+ i16.s16[i] = i;
+ i16_mix.s16[i] = (i & 1) ? i : -i;
+ i16_big.s16[i] = 1000 * (i + 1);
+ if ((i & 1) != 0) {
+ i16_big.s16[i] = -i16_big.s16[i];
+ }
+ }
+
+ for (i = 0; i < 16; i++) {
+ i32.s32[i] = i;
+ i32_mix.s32[i] = (i & 1) ? i : -i;
+ i32_big.s32[i] = 1000 * (i + 1);
+ if ((i & 1) != 0) {
+ i32_big.s32[i] = -i32_big.s32[i];
+ }
+ }
+
+ for (i = 0; i < 8; i++) {
+ i64.s64[i] = i;
+ i64_mix.s64[i] = (i & 1) ? i : -i;
+ }
+}
+
+/* Saturation utility functions for emulation. */
+
+static signed char NOINLINE sat16_8(short s) {
+ return (s < -128) ? -128 : ((s > 127) ? 127 : s);
+}
+
+static unsigned char NOINLINE usat16_8(unsigned short s) {
+ return (s > 255) ? 255 : s;
+}
+
+static signed char NOINLINE sat32_8(int s) {
+ return (s < -128) ? -128 : ((s > 127) ? 127 : s);
+}
+
+static unsigned char usat32_8(unsigned int s) { return (s > 255) ? 255 : s; }
+
+static short NOINLINE sat32_16(int s) {
+ return (s < (int)0xffff8000) ? (int)0xffff8000
+ : ((s > (int)0x7fff) ? (int)0x7fff : s);
+}
+
+static unsigned short NOINLINE usat32_16(unsigned int s) {
+ return (s > (unsigned int)0xffff) ? (unsigned int)0xffff : s;
+}
+
+static signed char NOINLINE sat64_8(__int64 s) {
+ return (s < -128) ? -128 : ((s > 127) ? 127 : s);
+}
+
+static unsigned char NOINLINE usat64_8(unsigned __int64 s) {
+ return (s > 255) ? 255 : s;
+}
+
+static short NOINLINE sat64_16(__int64 s) {
+ return (s < SHRT_MIN) ? SHRT_MIN : ((s > SHRT_MAX) ? SHRT_MAX : s);
+}
+
+static unsigned short NOINLINE usat64_16(unsigned __int64 s) {
+ return (s > USHRT_MAX) ? USHRT_MAX : s;
+}
+
+static int NOINLINE sat64_32(__int64 s) {
+ return (s < INT_MIN) ? INT_MIN : ((s > INT_MAX) ? INT_MAX : s);
+}
+
+static unsigned int NOINLINE usat64_32(unsigned __int64 s) {
+ return (s > UINT_MAX) ? UINT_MAX : s;
+}
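+
+/*
+ * Example values: sat32_16(70000) == 32767, sat32_16(-70000) == -32768,
+ * and usat32_16(70000) == 65535, matching the saturating behavior of
+ * vpmovsdw/vpmovusdw.
+ */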
+
+void NOINLINE do_pmovwb() {
+ V512 res;
+ V512 expected;
+ volatile int i;
+ __mmask8 k = 0xdb;
+
+ res.xmmi[0] = _mm_cvtepi16_epi8(i16.xmmi[0]);
+
+ expected.u64[1] = 0;
+ expected.u64[0] = 0;
+ for (i = 0; i < 8; i++) {
+ expected.s8[i] = i16.s16[i];
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_cvtepi16_epi8", __LINE__);
+
+ soft_src_update(i16);
+ res.xmmi[1] = i16.xmmi[0];
+ res.xmmi[0] = i16.xmmi[1];
+ res.xmmi[0] = _mm_mask_cvtepi16_epi8(res.xmmi[1], k, i16.xmmi[0]);
+
+ expected.u64[1] = 0;
+ expected.u64[0] = 0;
+ for (i = 0; i < 8; i++) {
+ if ((1 << i) & k) {
+ expected.s8[i] = i16.s16[i];
+ } else {
+ expected.s8[i] = res.s8[16 + i]; // From res.xmmi[1].
+ }
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_mask_cvtepi16_epi8", __LINE__);
+
+ soft_src_update(i16);
+ res.xmmi[0] = _mm_maskz_cvtepi16_epi8(k, i16.xmmi[0]);
+
+ expected.u64[1] = 0;
+ expected.u64[0] = 0;
+ for (i = 0; i < 8; i++) {
+ if ((1 << i) & k) {
+ expected.s8[i] = i16.s16[i];
+ } else {
+ expected.s8[i] = 0;
+ }
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_maskz_cvtepi16_epi8", __LINE__);
+
+ soft_src_update(i16);
+ res.xmmi[vol0] = i16_big.xmmi[0];
+ _mm_mask_cvtepi16_storeu_epi8(&res.xmmi[0], k, i16.xmmi[0]);
+ expected.xmmi[0] = i16_big.xmmi[0];
+ for (i = 0; i < 8; i++) {
+ if ((1 << i) & k) {
+ expected.s8[i] = i16.s16[i];
+ }
+ }
+
+ check_equal_nd(&res, &expected, 2, "_mm_mask_cvtepi16_storeu_epi8", __LINE__);
+}
+
+void NOINLINE do_pmovswb() {
+ V512 res;
+ V512 expected;
+ volatile int i;
+ __mmask8 k = 0xdc;
+
+ res.xmmi[0] = _mm_cvtsepi16_epi8(i16_mix.xmmi[0]);
+
+ expected.u64[1] = 0;
+ expected.u64[0] = 0;
+ for (i = 0; i < 8; i++) {
+ expected.s8[i] = sat16_8(i16_mix.s16[i]);
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_cvtsepi16_epi8", __LINE__);
+
+ soft_src_update(i16_mix);
+ res.xmmi[1] = i16.xmmi[0];
+ res.xmmi[0] = i16.xmmi[1];
+ res.xmmi[0] = _mm_mask_cvtsepi16_epi8(res.xmmi[1], k, i16_mix.xmmi[0]);
+
+ expected.u64[1] = 0;
+ expected.u64[0] = 0;
+ for (i = 0; i < 8; i++) {
+ if ((1 << i) & k) {
+ expected.s8[i] = sat16_8(i16_mix.s16[i]);
+ } else {
+ expected.s8[i] = res.s8[16 + i]; // From res.xmmi[1].
+ }
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_mask_cvtsepi16_epi8", __LINE__);
+
+ soft_src_update(i16_mix);
+ res.xmmi[0] = _mm_maskz_cvtsepi16_epi8(k, i16_mix.xmmi[0]);
+
+ expected.u64[1] = 0;
+ expected.u64[0] = 0;
+ for (i = 0; i < 8; i++) {
+ if ((1 << i) & k) {
+ expected.s8[i] = sat16_8(i16_mix.s16[i]);
+ } else {
+ expected.s8[i] = 0;
+ }
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_maskz_cvtsepi16_epi8", __LINE__);
+
+ soft_src_update(i16_mix);
+ res.xmmi[vol0] = i16_big.xmmi[0];
+ _mm_mask_cvtsepi16_storeu_epi8(&res.xmmi[0], k, i16_mix.xmmi[0]);
+ expected.xmmi[0] = i16_big.xmmi[0];
+ for (i = 0; i < 8; i++) {
+ if ((1 << i) & k) {
+ expected.s8[i] = sat16_8(i16_mix.s16[i]);
+ }
+ }
+
+ check_equal_nd(&res, &expected, 2, "_mm_mask_cvtsepi16_storeu_epi8",
+ __LINE__);
+}
+
+void NOINLINE do_pmovuswb() {
+ V512 res;
+ V512 expected;
+ volatile int i;
+ __mmask8 k = 0xfd;
+
+ res.xmmi[0] = _mm_cvtusepi16_epi8(i16_mix.xmmi[0]);
+
+ expected.u64[1] = 0;
+ expected.u64[0] = 0;
+ for (i = 0; i < 8; i++) {
+ expected.s8[i] = usat16_8(i16_mix.u16[i]);
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_cvtusepi16_epi8", __LINE__);
+
+ soft_src_update(i16_mix);
+ res.xmmi[1] = i16.xmmi[0];
+ res.xmmi[0] = i16.xmmi[1];
+ res.xmmi[0] = _mm_mask_cvtusepi16_epi8(res.xmmi[1], k, i16_mix.xmmi[0]);
+
+ expected.u64[1] = 0;
+ expected.u64[0] = 0;
+ for (i = 0; i < 8; i++) {
+ if ((1 << i) & k) {
+ expected.s8[i] = usat16_8(i16_mix.u16[i]);
+ } else {
+ expected.s8[i] = res.s8[16 + i]; // From res.xmmi[1].
+ }
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_mask_cvtusepi16_epi8", __LINE__);
+
+ soft_src_update(i16_mix);
+ res.xmmi[0] = _mm_maskz_cvtusepi16_epi8(k, i16_mix.xmmi[0]);
+
+ expected.u64[1] = 0;
+ expected.u64[0] = 0;
+ for (i = 0; i < 8; i++) {
+ if ((1 << i) & k) {
+ expected.s8[i] = usat16_8(i16_mix.u16[i]);
+ } else {
+ expected.s8[i] = 0;
+ }
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_maskz_cvtusepi16_epi8", __LINE__);
+
+ soft_src_update(i16_mix);
+ res.xmmi[vol0] = i16_big.xmmi[0];
+ _mm_mask_cvtusepi16_storeu_epi8(&res.xmmi[0], k, i16_mix.xmmi[0]);
+ expected.xmmi[0] = i16_big.xmmi[0];
+ for (i = 0; i < 8; i++) {
+ if ((1 << i) & k) {
+ expected.s8[i] = usat16_8(i16_mix.u16[i]);
+ }
+ }
+
+ check_equal_nd(&res, &expected, 2, "_mm_mask_cvtusepi16_storeu_epi8",
+ __LINE__);
+}
+
+void NOINLINE do_pmovdb() {
+ V512 res;
+ V512 expected;
+ volatile int i;
+ __mmask8 k = 0xab;
+
+ res.xmmi[0] = _mm_cvtepi32_epi8(i32.xmmi[0]);
+
+ expected.u64[1] = 0;
+ expected.u64[0] = 0;
+ for (i = 0; i < 4; i++) {
+ expected.s8[i] = i32.s32[i];
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_cvtepi32_epi8", __LINE__);
+
+ soft_src_update(i32);
+ res.xmmi[1] = i16.xmmi[0];
+ res.xmmi[0] = i16.xmmi[1];
+ res.xmmi[0] = _mm_mask_cvtepi32_epi8(res.xmmi[1], k, i32.xmmi[0]);
+
+ expected.u64[1] = 0;
+ expected.u64[0] = 0;
+ for (i = 0; i < 4; i++) {
+ if ((1 << i) & k) {
+ expected.s8[i] = i32.s32[i];
+ } else {
+ expected.s8[i] = res.s8[16 + i]; // From res.xmmi[1].
+ }
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_mask_cvtepi32_epi8", __LINE__);
+
+ soft_src_update(i32);
+ res.xmmi[0] = _mm_maskz_cvtepi32_epi8(k, i32.xmmi[0]);
+
+ expected.u64[1] = 0;
+ expected.u64[0] = 0;
+ for (i = 0; i < 4; i++) {
+ if ((1 << i) & k) {
+ expected.s8[i] = i32.s32[i];
+ } else {
+ expected.s8[i] = 0;
+ }
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_maskz_cvtepi32_epi8", __LINE__);
+
+ soft_src_update(i32);
+ res.xmmi[vol0] = i32_big.xmmi[0];
+ _mm_mask_cvtepi32_storeu_epi8(&res.xmmi[0], k, i32.xmmi[0]);
+ expected.xmmi[0] = i32_big.xmmi[0];
+ for (i = 0; i < 4; i++) {
+ if ((1 << i) & k) {
+ expected.s8[i] = i32.s32[i];
+ }
+ }
+
+ check_equal_nd(&res, &expected, 1, "_mm_mask_cvtepi32_storeu_epi8", __LINE__);
+}
+
+void NOINLINE do_pmovsdb() {
+ V512 res;
+ V512 expected;
+ volatile int i;
+ __mmask8 k = 0xdb;
+
+ res.xmmi[0] = _mm_cvtsepi32_epi8(i32_mix.xmmi[0]);
+
+ expected.u64[1] = 0;
+ expected.u64[0] = 0;
+ for (i = 0; i < 4; i++) {
+ expected.s8[i] = sat32_8(i32_mix.s32[i]);
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_cvtsepi32_epi8", __LINE__);
+
+ soft_src_update(i32_mix);
+ res.xmmi[1] = i16.xmmi[0];
+ res.xmmi[0] = i16.xmmi[1];
+ res.xmmi[0] = _mm_mask_cvtsepi32_epi8(res.xmmi[1], k, i32_mix.xmmi[0]);
+
+ expected.u64[1] = 0;
+ expected.u64[0] = 0;
+ for (i = 0; i < 4; i++) {
+ if ((1 << i) & k) {
+ expected.s8[i] = sat32_8(i32_mix.s32[i]);
+ } else {
+ expected.s8[i] = res.s8[16 + i]; // From res.xmmi[1].
+ }
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_mask_cvtsepi32_epi8", __LINE__);
+
+ soft_src_update(i32_mix);
+ res.xmmi[0] = _mm_maskz_cvtsepi32_epi8(k, i32_mix.xmmi[0]);
+
+ expected.u64[1] = 0;
+ expected.u64[0] = 0;
+ for (i = 0; i < 4; i++) {
+ if ((1 << i) & k) {
+ expected.s8[i] = sat32_8(i32_mix.s32[i]);
+ } else {
+ expected.s8[i] = 0;
+ }
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_maskz_cvtsepi32_epi8", __LINE__);
+
+ soft_src_update(i32_mix);
+ res.xmmi[vol0] = i32_big.xmmi[0];
+ _mm_mask_cvtsepi32_storeu_epi8(&res.xmmi[0], k, i32_mix.xmmi[0]);
+ expected.xmmi[0] = i32_big.xmmi[0];
+ for (i = 0; i < 4; i++) {
+ if ((1 << i) & k) {
+ expected.s8[i] = sat32_8(i32_mix.s32[i]);
+ }
+ }
+
+ check_equal_nd(&res, &expected, 1, "_mm_mask_cvtsepi32_storeu_epi8",
+ __LINE__);
+}
+
+void NOINLINE do_pmovusdb() {
+ V512 res;
+ V512 expected;
+ volatile int i;
+ __mmask8 k = 0xfd;
+
+ res.xmmi[0] = _mm_cvtusepi32_epi8(i32_mix.xmmi[0]);
+
+ expected.u64[1] = 0;
+ expected.u64[0] = 0;
+ for (i = 0; i < 4; i++) {
+ expected.s8[i] = usat32_8(i32_mix.u32[i]);
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_cvtusepi32_epi8", __LINE__);
+
+ soft_src_update(i32_mix);
+ res.xmmi[1] = i16.xmmi[0];
+ res.xmmi[0] = i16.xmmi[1];
+ res.xmmi[0] = _mm_mask_cvtusepi32_epi8(res.xmmi[1], k, i32_mix.xmmi[0]);
+
+ expected.u64[1] = 0;
+ expected.u64[0] = 0;
+ for (i = 0; i < 4; i++) {
+ if ((1 << i) & k) {
+ expected.s8[i] = usat32_8(i32_mix.u32[i]);
+ } else {
+ expected.s8[i] = res.s8[16 + i]; // From res.xmmi[1].
+ }
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_mask_cvtusepi32_epi8", __LINE__);
+
+ soft_src_update(i32_mix);
+ res.xmmi[0] = _mm_maskz_cvtusepi32_epi8(k, i32_mix.xmmi[0]);
+
+ expected.u64[1] = 0;
+ expected.u64[0] = 0;
+ for (i = 0; i < 4; i++) {
+ if ((1 << i) & k) {
+ expected.s8[i] = usat32_8(i32_mix.u32[i]);
+ } else {
+ expected.s8[i] = 0;
+ }
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_maskz_cvtusepi32_epi8", __LINE__);
+
+ soft_src_update(i32_mix);
+ res.xmmi[vol0] = i32_big.xmmi[0];
+ _mm_mask_cvtusepi32_storeu_epi8(&res.xmmi[0], k, i32_mix.xmmi[0]);
+ expected.xmmi[0] = i32_big.xmmi[0];
+ for (i = 0; i < 4; i++) {
+ if ((1 << i) & k) {
+ expected.s8[i] = usat32_8(i32_mix.u32[i]);
+ }
+ }
+
+ check_equal_nd(&res, &expected, 1, "_mm_mask_cvtusepi32_storeu_epi8",
+ __LINE__);
+}
+
+void NOINLINE do_pmovdw() {
+ V512 res;
+ V512 expected;
+ volatile int i;
+ __mmask8 k = 0xcd;
+
+ res.xmmi[0] = _mm_cvtepi32_epi16(i32.xmmi[0]);
+
+ expected.u64[1] = 0;
+ for (i = 0; i < 4; i++) {
+ expected.s16[i] = i32.s32[i];
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_cvtepi32_epi16", __LINE__);
+
+ soft_src_update(i32);
+ res.xmmi[1] = i8.xmmi[0];
+ res.xmmi[0] = i8.xmmi[1];
+ res.xmmi[0] = _mm_mask_cvtepi32_epi16(res.xmmi[1], k, i32.xmmi[0]);
+
+ expected.u64[1] = 0;
+ for (i = 0; i < 4; i++) {
+ if ((1 << i) & k) {
+ expected.s16[i] = i32.s32[i];
+ } else {
+ expected.s16[i] = res.s16[8 + i]; // From res.xmmi[1].
+ }
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_mask_cvtepi32_epi16", __LINE__);
+
+ soft_src_update(i32);
+ res.xmmi[0] = _mm_maskz_cvtepi32_epi16(k, i32.xmmi[0]);
+
+ expected.u64[1] = 0;
+ for (i = 0; i < 4; i++) {
+ if ((1 << i) & k) {
+ expected.s16[i] = i32.s32[i];
+ } else {
+ expected.s16[i] = 0;
+ }
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_maskz_cvtepi32_epi16", __LINE__);
+
+ soft_src_update(i32);
+ res.xmmi[vol0] = i8.xmmi[0];
+ _mm_mask_cvtepi32_storeu_epi16(&res.xmmi[0], k, i32.xmmi[0]);
+ expected.xmmi[0] = i8.xmmi[0];
+ for (i = 0; i < 4; i++) {
+ if ((1 << i) & k) {
+ expected.s16[i] = i32.s32[i];
+ }
+ }
+
+ check_equal_nd(&res, &expected, 2, "_mm_mask_cvtepi32_storeu_epi16",
+ __LINE__);
+}
+
+void NOINLINE do_pmovsdw() {
+ V512 res;
+ V512 expected;
+ volatile int i;
+ __mmask8 k = 0xfe;
+
+ res.xmmi[0] = _mm_cvtsepi32_epi16(i32_mix.xmmi[0]);
+
+ expected.u64[1] = 0;
+ for (i = 0; i < 4; i++) {
+ expected.s16[i] = sat32_16(i32_mix.s32[i]);
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_cvtsepi32_epi16", __LINE__);
+
+ soft_src_update(i32_mix);
+ res.xmmi[1] = i16.xmmi[0];
+ res.xmmi[0] = i16.xmmi[1];
+ res.xmmi[0] = _mm_mask_cvtsepi32_epi16(res.xmmi[1], k, i32_mix.xmmi[0]);
+
+ expected.u64[1] = 0;
+ for (i = 0; i < 4; i++) {
+ if ((1 << i) & k) {
+ expected.s16[i] = sat32_16(i32_mix.s32[i]);
+ } else {
+ expected.s16[i] = res.s16[8 + i]; // From res.xmmi[1].
+ }
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_mask_cvtsepi32_epi16", __LINE__);
+
+ soft_src_update(i32_mix);
+ res.xmmi[0] = _mm_maskz_cvtsepi32_epi16(k, i32_mix.xmmi[0]);
+
+ expected.u64[1] = 0;
+ for (i = 0; i < 4; i++) {
+ if ((1 << i) & k) {
+ expected.s16[i] = sat32_16(i32_mix.s32[i]);
+ } else {
+ expected.s16[i] = 0;
+ }
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_maskz_cvtsepi32_epi16", __LINE__);
+
+ soft_src_update(i32_mix);
+ res.xmmi[vol0] = i8.xmmi[0];
+ _mm_mask_cvtsepi32_storeu_epi16(&res.xmmi[0], k, i32_mix.xmmi[0]);
+ expected.xmmi[0] = i8.xmmi[0];
+ for (i = 0; i < 4; i++) {
+ if ((1 << i) & k) {
+ expected.s16[i] = sat32_16(i32_mix.s32[i]);
+ }
+ }
+
+ check_equal_nd(&res, &expected, 2, "_mm_mask_cvtsepi32_storeu_epi16",
+ __LINE__);
+}
+
+void NOINLINE do_pmovusdw() {
+ V512 res;
+ V512 expected;
+ volatile int i;
+ __mmask8 k = 0xfe;
+
+ res.xmmi[0] = _mm_cvtusepi32_epi16(i32_mix.xmmi[0]);
+
+ expected.u64[1] = 0;
+ for (i = 0; i < 4; i++) {
+ expected.u16[i] = usat32_16(i32_mix.u32[i]);
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_cvtusepi32_epi16", __LINE__);
+
+ soft_src_update(i32_mix);
+ res.xmmi[1] = i16.xmmi[0];
+ res.xmmi[0] = i16.xmmi[1];
+ res.xmmi[0] = _mm_mask_cvtusepi32_epi16(res.xmmi[1], k, i32_mix.xmmi[0]);
+
+ expected.u64[1] = 0;
+ for (i = 0; i < 4; i++) {
+ if ((1 << i) & k) {
+ expected.u16[i] = usat32_16(i32_mix.u32[i]);
+ } else {
+ expected.u16[i] = res.u16[8 + i]; // From res.xmmi[1].
+ }
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_mask_cvtusepi32_epi16", __LINE__);
+
+ soft_src_update(i32_mix);
+ res.xmmi[0] = _mm_maskz_cvtusepi32_epi16(k, i32_mix.xmmi[0]);
+
+ expected.u64[1] = 0;
+ for (i = 0; i < 4; i++) {
+ if ((1 << i) & k) {
+ expected.u16[i] = usat32_16(i32_mix.u32[i]);
+ } else {
+ expected.u16[i] = 0;
+ }
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_maskz_cvtusepi32_epi16", __LINE__);
+
+ soft_src_update(i32_mix);
+ res.xmmi[vol0] = i8.xmmi[0];
+ _mm_mask_cvtusepi32_storeu_epi16(&res.xmmi[0], k, i32_mix.xmmi[0]);
+ expected.xmmi[0] = i8.xmmi[0];
+ for (i = 0; i < 4; i++) {
+ if ((1 << i) & k) {
+      expected.u16[i] = usat32_16(i32_mix.u32[i]);
+ }
+ }
+
+ check_equal_nd(&res, &expected, 2, "_mm_mask_cvtusepi32_storeu_epi16",
+ __LINE__);
+}
+
+void NOINLINE do_pmovqb() {
+ V512 res;
+ V512 expected;
+ volatile int i;
+ __mmask8 k = 0x76;
+
+ res.xmmi[0] = _mm_cvtepi64_epi8(i64.xmmi[0]);
+
+ expected.u64[0] = 0;
+ expected.u64[1] = 0;
+ for (i = 0; i < 2; i++) {
+ expected.s8[i] = i64.s64[i];
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_cvtepi64_epi8", __LINE__);
+
+ /*
+ * Exercise ciscization.
+ */
+
+ _mm_store_sd(&res.f64[2], _mm_castsi128_pd(_mm_cvtepi64_epi8(i64.xmmi[0])));
+ check_equal_nd(&res.f64[2], &expected, 2, "_mm_cvtepi64_epi8 ciscized",
+ __LINE__);
+
+ soft_src_update(i64);
+ res.xmmi[1] = i16.xmmi[0];
+ res.xmmi[0] = i16.xmmi[1];
+ res.xmmi[0] = _mm_mask_cvtepi64_epi8(res.xmmi[1], k, i64.xmmi[0]);
+
+ expected.u64[0] = 0;
+ expected.u64[1] = 0;
+ for (i = 0; i < 2; i++) {
+ if ((1 << i) & k) {
+ expected.s8[i] = i64.s64[i];
+ } else {
+ expected.s8[i] = res.s8[16 + i]; // From res.xmmi[1].
+ }
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_mask_cvtepi64_epi8", __LINE__);
+
+ soft_src_update(i64);
+ res.xmmi[0] = _mm_maskz_cvtepi64_epi8(k, i64.xmmi[0]);
+
+ expected.u64[0] = 0;
+ expected.u64[1] = 0;
+ for (i = 0; i < 2; i++) {
+ if ((1 << i) & k) {
+ expected.s8[i] = i64.s64[i];
+ } else {
+ expected.s8[i] = 0;
+ }
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_maskz_cvtepi64_epi8", __LINE__);
+
+ soft_src_update(i64);
+ res.xmmi[vol0] = i8.xmmi[0];
+ _mm_mask_cvtepi64_storeu_epi8(&res.xmmi[0], k, i64.xmmi[0]);
+ expected.xmmi[0] = i8.xmmi[0];
+ for (i = 0; i < 2; i++) {
+ if ((1 << i) & k) {
+ expected.s8[i] = i64.s64[i];
+ }
+ }
+
+ check_equal_nd(&res, &expected, 1, "_mm_mask_cvtepi64_storeu_epi8", __LINE__);
+}
+
+void NOINLINE do_pmovsqb() {
+ V512 res;
+ V512 expected;
+ volatile int i;
+ __mmask8 k = 0x67;
+
+ res.xmmi[0] = _mm_cvtsepi64_epi8(i64_mix.xmmi[0]);
+
+ expected.u64[0] = 0;
+ expected.u64[1] = 0;
+ for (i = 0; i < 2; i++) {
+ expected.s8[i] = sat64_8(i64_mix.s64[i]);
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_cvtsepi64_epi8", __LINE__);
+
+ soft_src_update(i64_mix);
+ res.xmmi[1] = i16.xmmi[0];
+ res.xmmi[0] = i16.xmmi[1];
+ res.xmmi[0] = _mm_mask_cvtsepi64_epi8(res.xmmi[1], k, i64_mix.xmmi[0]);
+
+ expected.u64[0] = 0;
+ expected.u64[1] = 0;
+ for (i = 0; i < 2; i++) {
+ if ((1 << i) & k) {
+ expected.s8[i] = sat64_8(i64_mix.s64[i]);
+ } else {
+ expected.s8[i] = res.s8[16 + i]; // From res.xmmi[1].
+ }
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_mask_cvtsepi64_epi8", __LINE__);
+
+ soft_src_update(i64_mix);
+ res.xmmi[0] = _mm_maskz_cvtsepi64_epi8(k, i64_mix.xmmi[0]);
+
+ expected.u64[0] = 0;
+ expected.u64[1] = 0;
+ for (i = 0; i < 2; i++) {
+ if ((1 << i) & k) {
+ expected.s8[i] = sat64_8(i64_mix.s64[i]);
+ } else {
+ expected.s8[i] = 0;
+ }
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_maskz_cvtsepi64_epi8", __LINE__);
+
+ soft_src_update(i64_mix);
+ res.xmmi[vol0] = i8.xmmi[0];
+ _mm_mask_cvtsepi64_storeu_epi8(&res.xmmi[0], k, i64_mix.xmmi[0]);
+ expected.xmmi[0] = i8.xmmi[0];
+ for (i = 0; i < 2; i++) {
+ if ((1 << i) & k) {
+ expected.s8[i] = sat64_8(i64_mix.s64[i]);
+ }
+ }
+
+ check_equal_nd(&res, &expected, 1, "_mm_mask_cvtsepi64_storeu_epi8",
+ __LINE__);
+}
+
+void NOINLINE do_pmovusqb() {
+ V512 res;
+ V512 expected;
+ volatile int i;
+ __mmask8 k = 0x67;
+
+ res.xmmi[0] = _mm_cvtusepi64_epi8(i64_mix.xmmi[0]);
+
+ expected.u64[0] = 0;
+ expected.u64[1] = 0;
+ for (i = 0; i < 2; i++) {
+    expected.u8[i] = usat64_8(i64_mix.u64[i]);
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_cvtusepi64_epi8", __LINE__);
+
+ soft_src_update(i64_mix);
+ res.xmmi[1] = i16.xmmi[0];
+ res.xmmi[0] = i16.xmmi[1];
+ res.xmmi[0] = _mm_mask_cvtusepi64_epi8(res.xmmi[1], k, i64_mix.xmmi[0]);
+
+ expected.u64[0] = 0;
+ expected.u64[1] = 0;
+ for (i = 0; i < 2; i++) {
+ if ((1 << i) & k) {
+ expected.s8[i] = usat64_8(i64_mix.u64[i]);
+ } else {
+ expected.s8[i] = res.s8[16 + i]; // From res.xmmi[1].
+ }
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_mask_cvtusepi64_epi8", __LINE__);
+
+ soft_src_update(i64_mix);
+ res.xmmi[0] = _mm_maskz_cvtusepi64_epi8(k, i64_mix.xmmi[0]);
+
+ expected.u64[0] = 0;
+ expected.u64[1] = 0;
+ for (i = 0; i < 2; i++) {
+ if ((1 << i) & k) {
+ expected.s8[i] = usat64_8(i64_mix.u64[i]);
+ } else {
+ expected.s8[i] = 0;
+ }
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_maskz_cvtusepi64_epi8", __LINE__);
+
+ soft_src_update(i64_mix);
+ res.xmmi[vol0] = i8.xmmi[0];
+ _mm_mask_cvtusepi64_storeu_epi8(&res.xmmi[0], k, i64_mix.xmmi[0]);
+ expected.xmmi[0] = i8.xmmi[0];
+ for (i = 0; i < 2; i++) {
+ if ((1 << i) & k) {
+ expected.s8[i] = usat64_8(i64_mix.u64[i]);
+ }
+ }
+
+ check_equal_nd(&res, &expected, 1, "_mm_mask_cvtusepi64_storeu_epi8",
+ __LINE__);
+}
+
+void NOINLINE do_pmovqw() {
+ V512 res;
+ V512 expected;
+ volatile int i;
+ __mmask8 k = 0xe9;
+
+ res.xmmi[0] = _mm_cvtepi64_epi16(i64.xmmi[0]);
+
+ expected.u64[1] = 0;
+ expected.u64[0] = 0;
+ for (i = 0; i < 2; i++) {
+ expected.s16[i] = i64.s64[i];
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_cvtepi64_epi16", __LINE__);
+
+ soft_src_update(i64);
+ res.xmmi[1] = i8.xmmi[0];
+ res.xmmi[0] = i16.xmmi[1];
+ res.xmmi[0] = _mm_mask_cvtepi64_epi16(res.xmmi[1], k, i64.xmmi[0]);
+
+ expected.u64[1] = 0;
+ expected.u64[0] = 0;
+ for (i = 0; i < 2; i++) {
+ if ((1 << i) & k) {
+ expected.s16[i] = i64.s64[i];
+ } else {
+ expected.s16[i] = res.s16[8 + i]; // From res.xmmi[1].
+ }
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_mask_cvtepi64_epi16", __LINE__);
+
+ soft_src_update(i64);
+ res.xmmi[0] = _mm_maskz_cvtepi64_epi16(k, i64.xmmi[0]);
+
+ expected.u64[1] = 0;
+ expected.u64[0] = 0;
+ for (i = 0; i < 2; i++) {
+ if ((1 << i) & k) {
+ expected.s16[i] = i64.s64[i];
+ } else {
+ expected.s16[i] = 0;
+ }
+ }
+
+  check_equal_nd(&res, &expected, 4, "_mm_maskz_cvtepi64_epi16", __LINE__);
+
+ soft_src_update(i64);
+ res.xmmi[vol0] = i8.xmmi[0];
+ _mm_mask_cvtepi64_storeu_epi16(&res.xmmi[0], k, i64.xmmi[0]);
+ expected.xmmi[0] = i8.xmmi[0];
+ for (i = 0; i < 2; i++) {
+ if ((1 << i) & k) {
+ expected.s16[i] = i64.s64[i];
+ }
+ }
+
+ check_equal_nd(&res, &expected, 1, "_mm_mask_cvtepi64_storeu_epi16",
+ __LINE__);
+}
+
+void NOINLINE do_pmovsqw() {
+ V512 res;
+ V512 expected;
+ volatile int i;
+ __mmask8 k = 0xe9;
+
+ res.xmmi[0] = _mm_cvtsepi64_epi16(i64_mix.xmmi[0]);
+
+ expected.u64[1] = 0;
+ expected.u64[0] = 0;
+ for (i = 0; i < 2; i++) {
+ expected.s16[i] = sat64_16(i64_mix.s64[i]);
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_cvtsepi64_epi16", __LINE__);
+
+ soft_src_update(i64_mix);
+ res.xmmi[1] = i8.xmmi[0];
+ res.xmmi[0] = i16.xmmi[1];
+ res.xmmi[0] = _mm_mask_cvtsepi64_epi16(res.xmmi[1], k, i64_mix.xmmi[0]);
+
+ expected.u64[1] = 0;
+ expected.u64[0] = 0;
+ for (i = 0; i < 2; i++) {
+ if ((1 << i) & k) {
+ expected.s16[i] = sat64_16(i64_mix.s64[i]);
+ } else {
+ expected.s16[i] = res.s16[8 + i]; // From res.xmmi[1].
+ }
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_mask_cvtsepi64_epi16", __LINE__);
+
+ soft_src_update(i64_mix);
+ res.xmmi[0] = _mm_maskz_cvtsepi64_epi16(k, i64_mix.xmmi[0]);
+
+ expected.u64[1] = 0;
+ expected.u64[0] = 0;
+ for (i = 0; i < 2; i++) {
+ if ((1 << i) & k) {
+ expected.s16[i] = sat64_16(i64_mix.s64[i]);
+ } else {
+ expected.s16[i] = 0;
+ }
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_maskz_cvtsepi64_epi16", __LINE__);
+
+ soft_src_update(i64_mix);
+ res.xmmi[vol0] = i8.xmmi[0];
+ _mm_mask_cvtsepi64_storeu_epi16(&res.xmmi[0], k, i64_mix.xmmi[0]);
+ expected.xmmi[0] = i8.xmmi[0];
+ for (i = 0; i < 2; i++) {
+ if ((1 << i) & k) {
+ expected.s16[i] = sat64_16(i64_mix.s64[i]);
+ }
+ }
+
+ check_equal_nd(&res, &expected, 1, "_mm_mask_cvtsepi64_storeu_epi16",
+ __LINE__);
+}
+
+void NOINLINE do_pmovusqw() {
+ V512 res;
+ V512 expected;
+ volatile int i;
+ __mmask8 k = 0xe9;
+
+ res.xmmi[0] = _mm_cvtusepi64_epi16(i64_mix.xmmi[0]);
+
+ expected.u64[1] = 0;
+ expected.u64[0] = 0;
+ for (i = 0; i < 2; i++) {
+ expected.s16[i] = usat64_16(i64_mix.u64[i]);
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_cvtusepi64_epi16", __LINE__);
+
+ soft_src_update(i64_mix);
+ res.xmmi[1] = i8.xmmi[0];
+ res.xmmi[0] = i16.xmmi[1];
+ res.xmmi[0] = _mm_mask_cvtusepi64_epi16(res.xmmi[1], k, i64_mix.xmmi[0]);
+
+ expected.u64[1] = 0;
+ expected.u64[0] = 0;
+ for (i = 0; i < 2; i++) {
+ if ((1 << i) & k) {
+ expected.s16[i] = usat64_16(i64_mix.u64[i]);
+ } else {
+ expected.s16[i] = res.s16[8 + i]; // From res.xmmi[1].
+ }
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_mask_cvtusepi64_epi16", __LINE__);
+
+ soft_src_update(i64_mix);
+ res.xmmi[0] = _mm_maskz_cvtusepi64_epi16(k, i64_mix.xmmi[0]);
+
+ expected.u64[1] = 0;
+ expected.u64[0] = 0;
+ for (i = 0; i < 2; i++) {
+ if ((1 << i) & k) {
+ expected.s16[i] = usat64_16(i64_mix.u64[i]);
+ } else {
+ expected.s16[i] = 0;
+ }
+ }
+
+  check_equal_nd(&res, &expected, 4, "_mm_maskz_cvtusepi64_epi16", __LINE__);
+
+ soft_src_update(i64_mix);
+ res.xmmi[vol0] = i8.xmmi[0];
+ _mm_mask_cvtusepi64_storeu_epi16(&res.xmmi[0], k, i64_mix.xmmi[0]);
+ expected.xmmi[0] = i8.xmmi[0];
+ for (i = 0; i < 2; i++) {
+ if ((1 << i) & k) {
+ expected.s16[i] = usat64_16(i64_mix.u64[i]);
+ }
+ }
+
+ check_equal_nd(&res, &expected, 1, "_mm_mask_cvtusepi64_storeu_epi16",
+ __LINE__);
+}
+
+void NOINLINE do_pmovqd() {
+ V512 res;
+ V512 expected;
+ volatile int i;
+ __mmask8 k = 0xcf;
+
+ res.xmmi[0] = _mm_cvtepi64_epi32(i64.xmmi[0]);
+
+ expected.u64[1] = 0;
+ for (i = 0; i < 2; i++) {
+ expected.s32[i] = i64.s64[i];
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_cvtepi64_epi32", __LINE__);
+
+ soft_src_update(i64);
+ res.xmmi[1] = i8.xmmi[0];
+ res.xmmi[0] = i8.xmmi[1];
+ res.xmmi[0] = _mm_mask_cvtepi64_epi32(res.xmmi[1], k, i64.xmmi[0]);
+
+ expected.u64[1] = 0;
+ for (i = 0; i < 2; i++) {
+ if ((1 << i) & k) {
+ expected.s32[i] = i64.s64[i];
+ } else {
+ expected.s32[i] = res.s32[4 + i]; // From res.xmmi[1].
+ }
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_mask_cvtepi64_epi32", __LINE__);
+
+ soft_src_update(i64);
+ res.xmmi[0] = _mm_maskz_cvtepi64_epi32(k, i64.xmmi[0]);
+
+ expected.u64[1] = 0;
+ for (i = 0; i < 2; i++) {
+ if ((1 << i) & k) {
+ expected.s32[i] = i64.s64[i];
+ } else {
+ expected.s32[i] = 0;
+ }
+ }
+
+  check_equal_nd(&res, &expected, 4, "_mm_maskz_cvtepi64_epi32", __LINE__);
+
+ soft_src_update(i64);
+ res.xmmi[vol0] = i8.xmmi[0];
+ _mm_mask_cvtepi64_storeu_epi32(&res.xmmi[0], k, i64.xmmi[0]);
+ expected.xmmi[0] = i8.xmmi[0];
+ for (i = 0; i < 2; i++) {
+ if ((1 << i) & k) {
+ expected.s32[i] = i64.s64[i];
+ }
+ }
+
+ check_equal_nd(&res, &expected, 2, "_mm_mask_cvtepi64_storeu_epi32",
+ __LINE__);
+}
+
+void NOINLINE do_pmovsqd() {
+ V512 res;
+ V512 expected;
+ volatile int i;
+ __mmask8 k = 0xcf;
+
+ res.xmmi[0] = _mm_cvtsepi64_epi32(i64_mix.xmmi[0]);
+
+ expected.u64[1] = 0;
+ for (i = 0; i < 2; i++) {
+ expected.s32[i] = sat64_32(i64_mix.s64[i]);
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_cvtsepi64_epi32", __LINE__);
+
+ soft_src_update(i64_mix);
+ res.xmmi[1] = i8.xmmi[0];
+ res.xmmi[0] = i8.xmmi[1];
+ res.xmmi[0] = _mm_mask_cvtsepi64_epi32(res.xmmi[1], k, i64_mix.xmmi[0]);
+
+ expected.u64[1] = 0;
+ for (i = 0; i < 2; i++) {
+ if ((1 << i) & k) {
+ expected.s32[i] = sat64_32(i64_mix.s64[i]);
+ } else {
+ expected.s32[i] = res.s32[4 + i]; // From res.xmmi[1].
+ }
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_mask_cvtsepi64_epi32", __LINE__);
+
+ soft_src_update(i64_mix);
+ res.xmmi[0] = _mm_maskz_cvtsepi64_epi32(k, i64_mix.xmmi[0]);
+
+ expected.u64[1] = 0;
+ for (i = 0; i < 2; i++) {
+ if ((1 << i) & k) {
+ expected.s32[i] = sat64_32(i64_mix.s64[i]);
+ } else {
+ expected.s32[i] = 0;
+ }
+ }
+
+  check_equal_nd(&res, &expected, 4, "_mm_maskz_cvtsepi64_epi32", __LINE__);
+
+ soft_src_update(i64_mix);
+ res.xmmi[vol0] = i8.xmmi[0];
+ _mm_mask_cvtsepi64_storeu_epi32(&res.xmmi[0], k, i64_mix.xmmi[0]);
+ expected.xmmi[0] = i8.xmmi[0];
+ for (i = 0; i < 2; i++) {
+ if ((1 << i) & k) {
+ expected.s32[i] = sat64_32(i64_mix.s64[i]);
+ }
+ }
+
+ check_equal_nd(&res, &expected, 2, "_mm_mask_cvtsepi64_storeu_epi32",
+ __LINE__);
+}
+
+void NOINLINE do_pmovusqd() {
+ V512 res;
+ V512 expected;
+ volatile int i;
+ __mmask8 k = 0xcf;
+
+ res.xmmi[0] = _mm_cvtusepi64_epi32(i64_mix.xmmi[0]);
+
+ expected.u64[1] = 0;
+ for (i = 0; i < 2; i++) {
+ expected.u32[i] = usat64_32(i64_mix.u64[i]);
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_cvtusepi64_epi32", __LINE__);
+
+ soft_src_update(i64_mix);
+ res.xmmi[1] = i8.xmmi[0];
+ res.xmmi[0] = i8.xmmi[1];
+ res.xmmi[0] = _mm_mask_cvtusepi64_epi32(res.xmmi[1], k, i64_mix.xmmi[0]);
+
+ expected.u64[1] = 0;
+ for (i = 0; i < 2; i++) {
+ if ((1 << i) & k) {
+ expected.u32[i] = usat64_32(i64_mix.u64[i]);
+ } else {
+ expected.s32[i] = res.s32[4 + i]; // From res.xmmi[1].
+ }
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_mask_cvtusepi64_epi32", __LINE__);
+
+ soft_src_update(i64_mix);
+ res.xmmi[0] = _mm_maskz_cvtusepi64_epi32(k, i64_mix.xmmi[0]);
+
+ expected.u64[1] = 0;
+ for (i = 0; i < 2; i++) {
+ if ((1 << i) & k) {
+ expected.u32[i] = usat64_32(i64_mix.u64[i]);
+ } else {
+ expected.s32[i] = 0;
+ }
+ }
+
+ check_equal_nd(&res, &expected, 4, "_mm_maskz_cvtusepi64_epi32", __LINE__);
+
+ soft_src_update(i64_mix);
+ res.xmmi[vol0] = i8.xmmi[0];
+ _mm_mask_cvtusepi64_storeu_epi32(&res.xmmi[0], k, i64_mix.xmmi[0]);
+ expected.xmmi[0] = i8.xmmi[0];
+ for (i = 0; i < 2; i++) {
+ if ((1 << i) & k) {
+ expected.s32[i] = usat64_32(i64_mix.u64[i]);
+ }
+ }
+
+ check_equal_nd(&res, &expected, 2, "_mm_mask_cvtusepi64_storeu_epi32",
+ __LINE__);
+}
+
+int main(int argc, char *argv[]) {
+ init();
+
+ do_pmovwb();
+ do_pmovswb();
+ do_pmovuswb();
+
+ do_pmovdb();
+ do_pmovsdb();
+ do_pmovusdb();
+
+ do_pmovdw();
+ do_pmovsdw();
+ do_pmovusdw();
+
+ do_pmovqb();
+ do_pmovsqb();
+ do_pmovusqb();
+
+ do_pmovqw();
+ do_pmovsqw();
+ do_pmovusqw();
+
+ do_pmovqd();
+ do_pmovsqd();
+ do_pmovusqd();
+
+ if (n_errs != 0) {
+ printf("FAILED\n");
+ return 1;
+ }
+
+ printf("PASSED\n");
+ return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512BWVL/vpmovdown_xmm.reference_output b/SingleSource/UnitTests/Vector/AVX512BWVL/vpmovdown_xmm.reference_output
new file mode 100644
index 00000000..bfae62d0
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512BWVL/vpmovdown_xmm.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512DQVL/CMakeLists.txt b/SingleSource/UnitTests/Vector/AVX512DQVL/CMakeLists.txt
new file mode 100644
index 00000000..6afafe21
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512DQVL/CMakeLists.txt
@@ -0,0 +1,5 @@
+list(APPEND CPPFLAGS -I ${CMAKE_SOURCE_DIR}/${VECTOR_MAIN_DIR})
+list(APPEND LDFLAGS -lm)
+list(APPEND CFLAGS "-march=${X86CPU_ARCH}")
+list(APPEND CFLAGS -fms-extensions)
+llvm_singlesource(PREFIX "Vector-AVX512DQVL-")
diff --git a/SingleSource/UnitTests/Vector/AVX512DQVL/Makefile b/SingleSource/UnitTests/Vector/AVX512DQVL/Makefile
new file mode 100644
index 00000000..be476f26
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512DQVL/Makefile
@@ -0,0 +1,11 @@
+# SingleSource/UnitTests/Vector/AVX512DQVL/Makefile
+
+DIRS =
+LEVEL = ../../../..
+CFLAGS += -fms-extensions -march=native -mavx512dq -mavx512vl -I${SourceDir}/..
+LDFLAGS += -lm
+
+include $(LEVEL)/SingleSource/Makefile.singlesrc
+
+TARGET_FLAGS += -march=native -mavx512dq -mavx512vl
+LCCFLAGS += -march=native -mavx512dq -mavx512vl
diff --git a/SingleSource/UnitTests/Vector/AVX512DQVL/mask_set_dq.c b/SingleSource/UnitTests/Vector/AVX512DQVL/mask_set_dq.c
new file mode 100644
index 00000000..3a675ec1
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512DQVL/mask_set_dq.c
@@ -0,0 +1,234 @@
+/*
+ * Exercise intrinsics for instructions that set a mask register
+ * from values in vector registers, and that set a vector register
+ * from values in a mask register.
+ */
+
+#include "m512_test_util.h"
+
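+/*
+ * Informal sketch of the semantics under test: _mm*_movepi{32,64}_mask()
+ * set mask bit j to the sign (most significant) bit of element j, while
+ * _mm*_movm_epi{32,64}() expand mask bit j into an all-ones element when
+ * set and an all-zeros element when clear. The two helpers below model
+ * exactly this, bytewise.
+ */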
+__int64 calc_expected_mask_val(const char *valp, int el_size, int length) {
+ __int64 rval = 0;
+ int i;
+
+ for (i = 0; i < length; i++) {
+ if ((valp[el_size * i + (el_size - 1)] & 0x80) != 0) {
+ rval |= (1LL << i);
+ }
+ }
+
+ return rval;
+}
+
+char *calc_expected_vec_val(__mmask64 mask_val, int mask_size, int el_size,
+ char *buf) {
+ int i, j;
+
+ for (i = 0; i < mask_size * el_size; buf[i++] = 0)
+ ;
+
+ for (i = 0; i < mask_size; i++) {
+ if ((mask_val & (1LL << i)) != 0) {
+ for (j = 0; j < el_size; j++) {
+ buf[i * el_size + j] = 0xff;
+ }
+ }
+ }
+
+ return buf;
+}
+
+NOINLINE void check_mask16(__mmask16 res_mask, __mmask16 exp_mask,
+ const char *fname, const char *input) {
+ int i;
+
+ if (res_mask != exp_mask) {
+ printf("%s: 0x%x != 0x%x, input = ", fname, res_mask, exp_mask);
+ for (i = 0; i < 16; i++) {
+ printf("%02x ", input[i] & 0xff);
+ }
+ printf("\n");
+ n_errs++;
+ }
+}
+
+NOINLINE void check_xmm_arr(const __m128i xvar, char *buf, const char *fname,
+ __mmask64 input) {
+ int i;
+ char *p = (char *)&xvar;
+
+ if (memcmp((void *)p, (void *)buf, 16) != 0) {
+ printf("%s: 0x", fname);
+ for (i = 0; i < 16; i++) {
+ printf(" %02x", p[i] & 0xff);
+ }
+ printf(" != 0x");
+ for (i = 0; i < 16; i++) {
+ printf(" %02x", buf[i] & 0xff);
+ }
+    printf(", input = 0x%04x\n", (int)(input) & 0xffff);
+ n_errs++;
+ }
+}
+
+NOINLINE void test_xmm(int shift, int mulp) {
+ ALIGNTO(16) char buf[16];
+ int i;
+ __m128i xvar;
+
+ for (i = 0; i < 16; i++) {
+    /* Shift in 64 bits; "i << shift" on int is undefined for shift >= 32. */
+    buf[i] = (char)(((unsigned long long)i << shift) * mulp);
+ }
+
+ memcpy(&xvar, buf, 16);
+
+ check_mask16(_mm_movepi32_mask(xvar), calc_expected_mask_val(buf, 4, 4),
+ "_mm_movepi32_mask", buf);
+ check_mask16(_mm_movepi64_mask(xvar), calc_expected_mask_val(buf, 8, 2),
+ "_mm_movepi64_mask", buf);
+
+ check_xmm_arr(_mm_movm_epi32((__mmask16)shift * mulp),
+ calc_expected_vec_val(shift * mulp, 4, 4, buf),
+ "_mm_movm_epi32", (__mmask16)shift * mulp);
+ check_xmm_arr(_mm_movm_epi64((__mmask16)shift * mulp),
+ calc_expected_vec_val(shift * mulp, 2, 8, buf),
+ "_mm_movm_epi64", (__mmask16)shift * mulp);
+}
+
+NOINLINE void check_mask32(__mmask32 res_mask, __mmask32 exp_mask,
+ const char *fname, const char *input) {
+ int i;
+
+ if (res_mask != exp_mask) {
+ printf("%s: 0x%x != 0x%x, input = ", fname, res_mask, exp_mask);
+ for (i = 0; i < 32; i++) {
+ printf("%02x ", input[i] & 0xff);
+ }
+ printf("\n");
+ n_errs++;
+ }
+}
+
+NOINLINE void check_ymm_arr(const __m256i yvar, char *buf, const char *fname,
+ __mmask64 input) {
+ int i;
+ char *p = (char *)&yvar;
+
+ if (memcmp((void *)p, (void *)buf, 32) != 0) {
+ printf("%s: 0x", fname);
+ for (i = 0; i < 32; i++) {
+ printf(" %02x", p[i] & 0xff);
+ }
+ printf(" != 0x");
+ for (i = 0; i < 32; i++) {
+ printf(" %02x", buf[i] & 0xff);
+ }
+ printf(", input = 0x%04x\n", (int)(input));
+ n_errs++;
+ }
+}
+
+NOINLINE void test_ymm(int shift, int mulp) {
+ ALIGNTO(32) char buf[32];
+ int i;
+ __m256i yvar;
+
+ for (i = 0; i < 32; i++) {
+    buf[i] = (char)(((unsigned long long)i << shift) * mulp);
+ }
+
+ memcpy(&yvar, buf, 32);
+
+ check_mask32(_mm256_movepi32_mask(yvar), calc_expected_mask_val(buf, 4, 8),
+ "_mm256_movepi32_mask", buf);
+ check_mask32(_mm256_movepi64_mask(yvar), calc_expected_mask_val(buf, 8, 4),
+ "_mm256_movepi64_mask", buf);
+
+ check_ymm_arr(_mm256_movm_epi32((__mmask32)shift * mulp),
+ calc_expected_vec_val(shift * mulp, 8, 4, buf),
+ "_mm256_movm_epi32", (__mmask32)shift * mulp);
+ check_ymm_arr(_mm256_movm_epi64((__mmask32)shift * mulp),
+ calc_expected_vec_val(shift * mulp, 4, 8, buf),
+ "_mm256_movm_epi64", (__mmask32)shift * mulp);
+}
+
+NOINLINE void check_mask64(__mmask64 res_mask, __mmask64 exp_mask,
+ const char *fname, const char *input) {
+ int i;
+
+ if (res_mask != exp_mask) {
+ printf("%s: 0x%llx != 0x%llx, input = ", fname, res_mask, exp_mask);
+ for (i = 0; i < 64; i++) {
+ printf("%02x ", input[i] & 0xff);
+ }
+ printf("\n");
+ n_errs++;
+ }
+}
+
+NOINLINE void check_zmm_arr(const __m512i zvar, char *buf, const char *fname,
+ __mmask64 input) {
+ int i;
+ char *p = (char *)&zvar;
+
+ if (memcmp((void *)p, (void *)buf, 64) != 0) {
+ printf("%s: 0x", fname);
+ for (i = 0; i < 64; i++) {
+ printf(" %02x", p[i] & 0xff);
+ }
+ printf(" != 0x");
+ for (i = 0; i < 64; i++) {
+ printf(" %02x", buf[i] & 0xff);
+ }
+ printf(", input = 0x%08llx\n", input);
+ n_errs++;
+ }
+}
+
+NOINLINE void test_zmm(int shift, int mulp) {
+ ALIGNTO(64) char buf[64];
+ int i;
+ __m512i zvar;
+
+ for (i = 0; i < 64; i++) {
+    buf[i] = (char)(((unsigned long long)i << shift) * mulp);
+ }
+
+ memcpy(&zvar, buf, 64);
+
+ check_mask64(_mm512_movepi32_mask(zvar), calc_expected_mask_val(buf, 4, 16),
+ "_mm512_movepi32_mask", buf);
+ check_mask64(_mm512_movepi64_mask(zvar), calc_expected_mask_val(buf, 8, 8),
+ "_mm512_movepi64_mask", buf);
+
+ check_zmm_arr(_mm512_movm_epi32((__mmask64)shift * mulp),
+ calc_expected_vec_val(shift * mulp, 16, 4, buf),
+ "_mm512_movm_epi32", (__mmask64)shift * mulp);
+ check_zmm_arr(_mm512_movm_epi64((__mmask64)shift * mulp),
+ calc_expected_vec_val(shift * mulp, 8, 8, buf),
+ "_mm512_movm_epi64", (__mmask64)shift * mulp);
+}
+
+NOINLINE void test_all() {
+ int shift, mulp;
+
+ for (mulp = -1000; mulp < 1000; mulp += 10) {
+ for (shift = 0; shift < 64; shift++) {
+ test_xmm(shift, mulp);
+ test_ymm(shift, mulp);
+ test_zmm(shift, mulp);
+ }
+ }
+}
+
+int main(void) {
+ test_all();
+
+ if (n_errs != 0) {
+ printf("FAILED, n_errs = %d\n", n_errs);
+ return 1;
+ }
+
+ printf("PASSED\n");
+ return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512DQVL/mask_set_dq.reference_output b/SingleSource/UnitTests/Vector/AVX512DQVL/mask_set_dq.reference_output
new file mode 100644
index 00000000..bfae62d0
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512DQVL/mask_set_dq.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512F/i32gather_64_m512.c b/SingleSource/UnitTests/Vector/AVX512F/i32gather_64_m512.c
new file mode 100644
index 00000000..58edb6e7
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512F/i32gather_64_m512.c
@@ -0,0 +1,147 @@
+/*
+ * This test was created to check the correctness
+ * of the following intrinsics support:
+ * _mm512_i32gather_epi64()
+ * _mm512_mask_i32gather_epi64()
+ * _mm512_i32gather_pd()
+ * _mm512_mask_i32gather_pd()
+ */
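+
+/*
+ * Informal per-element reference for the masked gather (byte scale s):
+ *   dst[j] = ((k >> j) & 1) ? *(__int64 *)((char *)base + (__int64)ind[j] * s)
+ *                           : old_dst[j];
+ * the unmasked form behaves as if every mask bit were set.
+ */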
+
+#include <stdio.h>
+#include <x86intrin.h>
+
+#define NUM (256 * 256)
+#define SCALE 8
+
+double dst512_f[NUM];
+double dst_f[NUM];
+__int64 dst512_i[NUM];
+__int64 dst_i[NUM];
+double src_f[NUM];
+__int64 src_i[NUM];
+__int64 mask512[NUM / 8];
+int g_index[NUM];
+
+#define MIN(x, y) ((x) <= (y) ? (x) : (y))
+
+void init_data() {
+ int i;
+ for (i = 0; i < NUM; i++) {
+ g_index[i] = MIN(i * 17 & 0xce, NUM - 1);
+ src_f[g_index[i]] = src_i[g_index[i]] = i;
+
+ dst_i[i] = dst_f[i] = -i;
+ dst512_i[i] = -i;
+ dst512_f[i] = -i;
+
+ if (i % 8 == 0) {
+ mask512[i / 8] = (i * 31) & 0xff;
+ }
+ }
+}
+
+void do_mm512_mmask_i32gather_epi64() {
+ int i;
+ for (i = 0; i < NUM; i += 8) {
+ __m256i ind = _mm256_loadu_si256((const __m256i *)(g_index + i));
+ __m512i old_dst = _mm512_loadu_si512((const __m512i *)(dst_i + i));
+ __m512i gtr =
+ _mm512_mask_i32gather_epi64(old_dst, mask512[i / 8], ind, src_i, SCALE);
+ _mm512_storeu_si512((__m512i *)(dst512_i + i), gtr);
+ }
+}
+
+void do_mm512_mmask_i32gather_pd() {
+ int i;
+ for (i = 0; i < NUM; i += 8) {
+ __m256i ind = _mm256_loadu_si256((const __m256i *)(g_index + i));
+ __m512d old_dst = _mm512_loadu_pd(dst_f + i);
+ __m512d gtr =
+ _mm512_mask_i32gather_pd(old_dst, mask512[i / 8], ind, src_f, SCALE);
+ _mm512_storeu_pd(dst512_f + i, gtr);
+ }
+}
+
+void do_mm512_i32gather_epi64() {
+ int i;
+ for (i = 0; i < NUM; i += 8) {
+ __m256i ind = _mm256_loadu_si256((const __m256i *)(g_index + i));
+ __m512i gtr = _mm512_i32gather_epi64(ind, src_i, SCALE);
+ _mm512_storeu_si512((__m512i *)(dst512_i + i), gtr);
+ }
+}
+
+void do_mm512_i32gather_pd() {
+ int i;
+ for (i = 0; i < NUM; i += 8) {
+ __m256i ind = _mm256_loadu_si256((const __m256i *)(g_index + i));
+ __m512d gtr = _mm512_i32gather_pd(ind, src_f, SCALE);
+ _mm512_storeu_pd(dst512_f + i, gtr);
+ }
+}
+
+int checkm(int id, __int64 *res_dst, __int64 *pass_thru_vals, __int64 *mask,
+ __int64 *src, int elems_in_vector) {
+ int i;
+ for (i = 0; i < NUM; i++) {
+ __int64 kmask = mask[i / elems_in_vector];
+ __int64 kmask_bit = kmask & (1 << (i % elems_in_vector));
+
+ __int64 v = kmask_bit ? src[g_index[i]] : pass_thru_vals[i];
+
+ if (v != res_dst[i]) {
+ printf("The testcase #%d FAILed at %d iteration\n", id, i);
+
+      printf("Expected value %lld, actual %lld\n", (long long)v,
+             (long long)res_dst[i]);
+
+ return -1;
+ }
+ }
+ return 0;
+}
+
+int check(int id, __int64 *res_dst, __int64 *src, int elems_in_vector) {
+ int i;
+ for (i = 0; i < NUM; i++) {
+
+ __int64 v = src[g_index[i]];
+
+ if (v != res_dst[i]) {
+ printf("The testcase #%d FAILed at %d iteration\n", id, i);
+
+      printf("Expected value %lld, actual %lld\n", (long long)v,
+             (long long)res_dst[i]);
+
+ return -1;
+ }
+ }
+ return 0;
+}
+
+int main() {
+ int error = 0;
+
+ init_data();
+
+ do_mm512_mmask_i32gather_epi64();
+ error |= checkm(1, dst512_i, dst_i, mask512, src_i, 8);
+
+ do_mm512_mmask_i32gather_pd();
+ error |= checkm(2, (__int64 *)dst512_f, (__int64 *)dst_f, mask512,
+ (__int64 *)src_f, 8);
+
+ init_data();
+
+ do_mm512_i32gather_epi64();
+ error |= check(3, dst512_i, src_i, 8);
+
+ do_mm512_i32gather_pd();
+ error |= check(4, (__int64 *)dst512_f, (__int64 *)src_f, 8);
+
+ if (error != 0) {
+ printf("FAILED\n");
+ return 1;
+ }
+
+ printf("PASSED\n");
+ return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512F/i32gather_64_m512.reference_output b/SingleSource/UnitTests/Vector/AVX512F/i32gather_64_m512.reference_output
new file mode 100644
index 00000000..bfae62d0
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512F/i32gather_64_m512.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512F/i32scatter_64_m512.c b/SingleSource/UnitTests/Vector/AVX512F/i32scatter_64_m512.c
new file mode 100644
index 00000000..4a374970
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512F/i32scatter_64_m512.c
@@ -0,0 +1,134 @@
+/*
+ * This test was created to check the correctness
+ * of the following intrinsics support:
+ * _mm512_i32scatter_epi64()
+ * _mm512_mask_i32scatter_epi64()
+ * _mm512_i32scatter_pd()
+ * _mm512_mask_i32scatter_pd()
+ */
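+
+/*
+ * Informal per-element reference for the masked scatter (byte scale s):
+ * for each j whose mask bit is set,
+ *   *(__int64 *)((char *)base + (__int64)ind[j] * s) = val[j];
+ * elements with a clear mask bit leave memory untouched, and when two
+ * indices collide the higher-numbered element's value is the one kept.
+ */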
+
+#include <stdio.h>
+#include <x86intrin.h>
+
+#define NUM (256 * 256)
+#define SCALE 8
+
+double dst512_f[NUM], gold_dst512_f[NUM], full_gold_dst512_f[NUM];
+__int64 dst512_i[NUM], gold_dst512_i[NUM], full_gold_dst512_i[NUM];
+int mask512[NUM / 8];
+int full_mask512[NUM / 8];
+int g_index[NUM];
+
+void init_data() {
+ int i;
+ for (i = 0; i < NUM; i++) {
+ dst512_i[i] = -1;
+ dst512_f[i] = -1;
+
+ g_index[i] = i * 2;
+ if (g_index[i] >= NUM) {
+ g_index[i] = NUM - 1 - (i - NUM / 2) * 2;
+ }
+
+ if (i % 8 == 0) {
+ mask512[i / 8] = (i * 31) & 0xff;
+ full_mask512[i / 8] = 0xff;
+ }
+
+ if ((mask512[i / 8] >> (i % 8)) & 0x1) {
+ gold_dst512_i[g_index[i]] = gold_dst512_f[g_index[i]] = i;
+ } else {
+ gold_dst512_i[g_index[i]] = gold_dst512_f[g_index[i]] = -1;
+ }
+
+ full_gold_dst512_i[g_index[i]] = full_gold_dst512_f[g_index[i]] = i;
+ }
+}
+
+void do_mm512_mask_i32scatter_epi64() {
+ int i;
+ for (i = 0; i < NUM; i += 8) {
+ __m256i ind = _mm256_loadu_si256((const __m256i *)(g_index + i));
+ __m512i val =
+ _mm512_set_epi64(i + 7, i + 6, i + 5, i + 4, i + 3, i + 2, i + 1, i);
+ _mm512_mask_i32scatter_epi64(dst512_i, mask512[i / 8], ind, val, SCALE);
+ }
+}
+
+void do_mm512_mask_i32scatter_pd() {
+ int i;
+ for (i = 0; i < NUM; i += 8) {
+ __m256i ind = _mm256_loadu_si256((const __m256i *)(g_index + i));
+ __m512d val =
+ _mm512_set_pd(i + 7, i + 6, i + 5, i + 4, i + 3, i + 2, i + 1, i);
+ _mm512_mask_i32scatter_pd(dst512_f, mask512[i / 8], ind, val, SCALE);
+ }
+}
+
+void do_mm512_i32scatter_epi64() {
+ int i;
+ for (i = 0; i < NUM; i += 8) {
+ __m256i ind = _mm256_loadu_si256((const __m256i *)(g_index + i));
+ __m512i val =
+ _mm512_set_epi64(i + 7, i + 6, i + 5, i + 4, i + 3, i + 2, i + 1, i);
+ _mm512_i32scatter_epi64(dst512_i, ind, val, SCALE);
+ }
+}
+
+void do_mm512_i32scatter_pd() {
+ int i;
+ for (i = 0; i < NUM; i += 8) {
+ __m256i ind = _mm256_loadu_si256((const __m256i *)(g_index + i));
+ __m512d val =
+ _mm512_set_pd(i + 7, i + 6, i + 5, i + 4, i + 3, i + 2, i + 1, i);
+ _mm512_i32scatter_pd(dst512_f, ind, val, SCALE);
+ }
+}
+
+int check(int id, __int64 *res_dst, __int64 *gold_dst, int *mask,
+ int elems_in_vector) {
+ int i;
+ for (i = 0; i < NUM; i++) {
+ int kmask = mask[i / elems_in_vector];
+ int kmask_bit = kmask & (1 << (i % elems_in_vector));
+
+ if (gold_dst[i] != res_dst[i]) {
+ printf("The testcase #%d FAILed at %d iteration\n", id, i);
+
+      printf("Expected value %lld, actual %lld, kmask=%d\n",
+             (long long)gold_dst[i], (long long)res_dst[i], kmask_bit);
+
+ return -1;
+ }
+ }
+ return 0;
+}
+
+int main() {
+ int error = 0;
+
+ init_data();
+
+ do_mm512_mask_i32scatter_epi64();
+ error |= check(1, dst512_i, gold_dst512_i, mask512, 8);
+
+ do_mm512_mask_i32scatter_pd();
+ error |= check(2, (__int64 *)dst512_f, (__int64 *)gold_dst512_f, mask512, 8);
+
+ init_data();
+
+ do_mm512_i32scatter_epi64();
+ error |= check(3, dst512_i, full_gold_dst512_i, full_mask512, 8);
+
+ do_mm512_i32scatter_pd();
+ error |= check(4, (__int64 *)dst512_f, (__int64 *)full_gold_dst512_f,
+ full_mask512, 8);
+
+ if (error != 0) {
+ printf("FAILED\n");
+ return 1;
+ }
+
+ printf("PASSED\n");
+ return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512F/i32scatter_64_m512.reference_output b/SingleSource/UnitTests/Vector/AVX512F/i32scatter_64_m512.reference_output
new file mode 100644
index 00000000..bfae62d0
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512F/i32scatter_64_m512.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512F/i64gather_32_m512.c b/SingleSource/UnitTests/Vector/AVX512F/i64gather_32_m512.c
new file mode 100644
index 00000000..ad952caa
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512F/i64gather_32_m512.c
@@ -0,0 +1,148 @@
+/*
+ * This test was created to check the correctness
+ * of the following intrinsics support:
+ * _mm512_i64gather_epi32()
+ * _mm512_mask_i64gather_epi32()
+ * _mm512_i64gather_ps()
+ * _mm512_mask_i64gather_ps()
+ */
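+
+/*
+ * Informal sketch: here 64-bit indices select 32-bit elements,
+ *   dst[j] = ((k >> j) & 1) ? *(int *)((char *)base + ind[j] * s)
+ *                           : old_dst[j];
+ * so a 512-bit index vector pairs with a 256-bit data vector.
+ */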
+
+#include "m512_test_util.h"
+#include <stdio.h>
+
+#define NUM (256 * 256)
+#define SCALE 4
+
+float dst512_f[NUM];
+float dst_f[NUM];
+int dst512_i[NUM];
+int dst_i[NUM];
+float src_f[NUM];
+int src_i[NUM];
+int mask512[NUM / 8];
+__int64 g_index[NUM];
+
+#define MIN(x, y) ((x) <= (y) ? (x) : (y))
+
+void init_data() {
+ int i;
+ for (i = 0; i < NUM; i++) {
+ g_index[i] = MIN(i * 17 & 0xce, NUM - 1);
+ src_f[g_index[i]] = src_i[g_index[i]] = i;
+
+ dst_i[i] = dst_f[i] = -i;
+ dst512_i[i] = -i;
+ dst512_f[i] = -i;
+
+ if (i % 8 == 0) {
+ mask512[i / 8] = (i * 31) & 0xff;
+ }
+ }
+}
+
+void do_mm512_mask_i64gather_epi32() {
+ int i;
+ for (i = 0; i < NUM; i += 8) {
+ __m512i ind = _mm512_loadu_si512((const __m512i *)(g_index + i));
+ __m256i old_dst = _mm256_loadu_si256((const __m256i *)(dst_i + i));
+
+ __m256i gtr =
+ _mm512_mask_i64gather_epi32(old_dst, mask512[i / 8], ind, src_i, SCALE);
+
+ _mm256_storeu_si256((__m256i *)(dst512_i + i), gtr);
+ }
+}
+
+void do_mm512_mask_i64gather_ps() {
+ int i;
+ for (i = 0; i < NUM; i += 8) {
+ __m512i ind = _mm512_loadu_si512((const __m512i *)(g_index + i));
+ __m256 old_dst = _mm256_loadu_ps(dst_f + i);
+ __m256 gtr =
+ _mm512_mask_i64gather_ps(old_dst, mask512[i / 8], ind, src_f, SCALE);
+ _mm256_storeu_ps(dst512_f + i, gtr);
+ }
+}
+
+void do_mm512_i64gather_epi32() {
+ int i;
+ for (i = 0; i < NUM; i += 8) {
+ __m512i ind = _mm512_loadu_si512((const __m512i *)(g_index + i));
+ __m256i gtr = _mm512_i64gather_epi32(ind, src_i, SCALE);
+ _mm256_storeu_si256((__m256i *)(dst512_i + i), gtr);
+ }
+}
+
+void do_mm512_i64gather_ps() {
+ int i;
+ for (i = 0; i < NUM; i += 8) {
+ __m512i ind = _mm512_loadu_si512((const __m512i *)(g_index + i));
+ __m256 gtr = _mm512_i64gather_ps(ind, src_f, SCALE);
+ _mm256_storeu_ps(dst512_f + i, gtr);
+ }
+}
+
+int checkm(int id, int *res_dst, int *pass_thru_vals, int *mask, int *src,
+ int elems_in_vector) {
+ int i;
+ for (i = 0; i < NUM; i++) {
+ int kmask = mask[i / elems_in_vector];
+ int kmask_bit = kmask & (1 << (i % elems_in_vector));
+
+ int v = kmask_bit ? src[g_index[i]] : pass_thru_vals[i];
+
+ if (v != res_dst[i]) {
+ printf("The testcase #%d FAILed at %d iteration\n", id, i);
+
+ printf("Expected value %d, actual %d\n", v, res_dst[i]);
+
+ return -1;
+ }
+ }
+ return 0;
+}
+
+int check(int id, int *res_dst, int *src, int elems_in_vector) {
+ int i;
+ for (i = 0; i < NUM; i++) {
+
+ int v = src[g_index[i]];
+
+ if (v != res_dst[i]) {
+ printf("The testcase #%d FAILed at %d iteration\n", id, i);
+
+ printf("Expected value %d, actual %d\n", v, res_dst[i]);
+
+ return -1;
+ }
+ }
+ return 0;
+}
+
+int main() {
+ int error = 0;
+
+ init_data();
+
+ do_mm512_mask_i64gather_epi32();
+ error |= checkm(1, dst512_i, dst_i, mask512, src_i, 8);
+
+ do_mm512_mask_i64gather_ps();
+ error |= checkm(2, (int *)dst512_f, (int *)dst_f, mask512, (int *)src_f, 8);
+
+ init_data();
+
+ do_mm512_i64gather_epi32();
+ error |= check(3, dst512_i, src_i, 8);
+
+ do_mm512_i64gather_ps();
+ error |= check(4, (int *)dst512_f, (int *)src_f, 8);
+
+ if (error != 0) {
+ printf("FAILED\n");
+ return 1;
+ }
+
+ printf("PASSED\n");
+ return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512F/i64gather_32_m512.reference_output b/SingleSource/UnitTests/Vector/AVX512F/i64gather_32_m512.reference_output
new file mode 100644
index 00000000..bfae62d0
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512F/i64gather_32_m512.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512F/i64scatter_32_m512.c b/SingleSource/UnitTests/Vector/AVX512F/i64scatter_32_m512.c
new file mode 100644
index 00000000..2a147ec2
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512F/i64scatter_32_m512.c
@@ -0,0 +1,132 @@
+/*
+ * This test was created to check the correctness
+ * of the following intrinsics support:
+ * _mm512_i64scatter_epi32()
+ * _mm512_mask_i64scatter_epi32()
+ * _mm512_i64scatter_ps()
+ * _mm512_mask_i64scatter_ps()
+ */
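+
+/*
+ * Informal sketch: the 64-bit-index scatter pairs a 512-bit index vector
+ * with a 256-bit value vector; for each j whose mask bit is set,
+ *   *(int *)((char *)base + ind[j] * s) = val[j];
+ */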
+
+#include "m512_test_util.h"
+#include <stdio.h>
+
+#define NUM (256 * 256)
+#define SCALE 4
+
+float dst512_f[NUM], gold_dst512_f[NUM], full_gold_dst512_f[NUM];
+int dst512_i[NUM], gold_dst512_i[NUM], full_gold_dst512_i[NUM];
+int mask512[NUM / 8];
+int full_mask512[NUM / 8];
+__int64 g_index[NUM];
+
+void init_data() {
+ int i;
+ for (i = 0; i < NUM; i++) {
+ dst512_i[i] = -1;
+ dst512_f[i] = -1;
+
+ g_index[i] = i * 2;
+ if (g_index[i] >= NUM) {
+ g_index[i] = NUM - 1 - (i - NUM / 2) * 2;
+ }
+
+ if (i % 8 == 0) {
+ mask512[i / 8] = (i * 31) & 0xff;
+ full_mask512[i / 8] = 0xff;
+ }
+
+ if ((mask512[i / 8] >> (i % 8)) & 0x1) {
+ gold_dst512_i[g_index[i]] = gold_dst512_f[g_index[i]] = i;
+ } else {
+ gold_dst512_i[g_index[i]] = gold_dst512_f[g_index[i]] = -1;
+ }
+ full_gold_dst512_i[g_index[i]] = full_gold_dst512_f[g_index[i]] = i;
+ }
+}
+
+void do_mm512_mask_i64scatter_epi32() {
+ int i;
+ for (i = 0; i < NUM; i += 8) {
+ __m512i ind = _mm512_loadu_si512((const __m512i *)(g_index + i));
+ __m256i val =
+ _mm256_set_epi32(i + 7, i + 6, i + 5, i + 4, i + 3, i + 2, i + 1, i);
+ _mm512_mask_i64scatter_epi32(dst512_i, mask512[i / 8], ind, val, SCALE);
+ }
+}
+
+void do_mm512_mask_i64scatter_ps() {
+ int i;
+ for (i = 0; i < NUM; i += 8) {
+ __m512i ind = _mm512_loadu_si512((const __m512i *)(g_index + i));
+ __m256 val =
+ _mm256_set_ps(i + 7, i + 6, i + 5, i + 4, i + 3, i + 2, i + 1, i);
+ _mm512_mask_i64scatter_ps(dst512_f, mask512[i / 8], ind, val, SCALE);
+ }
+}
+
+void do_mm512_i64scatter_epi32() {
+ int i;
+ for (i = 0; i < NUM; i += 8) {
+ __m512i ind = _mm512_loadu_si512((const __m512i *)(g_index + i));
+ __m256i val =
+ _mm256_set_epi32(i + 7, i + 6, i + 5, i + 4, i + 3, i + 2, i + 1, i);
+ _mm512_i64scatter_epi32(dst512_i, ind, val, SCALE);
+ }
+}
+
+void do_mm512_i64scatter_ps() {
+ int i;
+ for (i = 0; i < NUM; i += 8) {
+ __m512i ind = _mm512_loadu_si512((const __m512i *)(g_index + i));
+ __m256 val =
+ _mm256_set_ps(i + 7, i + 6, i + 5, i + 4, i + 3, i + 2, i + 1, i);
+ _mm512_i64scatter_ps(dst512_f, ind, val, SCALE);
+ }
+}
+
+int check(int id, int *res_dst, int *gold_dst, int *mask, int elems_in_vector) {
+ int i;
+ for (i = 0; i < NUM; i++) {
+ int kmask = mask[i / elems_in_vector];
+ int kmask_bit = kmask & (1 << (i % elems_in_vector));
+
+ if (gold_dst[i] != res_dst[i]) {
+ printf("The testcase #%d FAILed at %d iteration\n", id, i);
+
+ printf("Expected value %d, actual %d, kmask=%d\n", gold_dst[i],
+ res_dst[i], kmask_bit);
+
+ return -1;
+ }
+ }
+ return 0;
+}
+
+int main() {
+ int error = 0;
+
+ init_data();
+
+ do_mm512_mask_i64scatter_epi32();
+ error |= check(1, dst512_i, gold_dst512_i, mask512, 8);
+
+ do_mm512_mask_i64scatter_ps();
+ error |= check(2, (int *)dst512_f, (int *)gold_dst512_f, mask512, 8);
+
+ init_data();
+
+ do_mm512_i64scatter_epi32();
+ error |= check(3, dst512_i, full_gold_dst512_i, full_mask512, 8);
+
+ do_mm512_i64scatter_ps();
+ error |=
+ check(4, (int *)dst512_f, (int *)full_gold_dst512_f, full_mask512, 8);
+
+ if (error != 0) {
+ printf("FAILED\n");
+ return 1;
+ }
+
+ printf("PASSED\n");
+ return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512F/i64scatter_32_m512.reference_output b/SingleSource/UnitTests/Vector/AVX512F/i64scatter_32_m512.reference_output
new file mode 100644
index 00000000..bfae62d0
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512F/i64scatter_32_m512.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512F/load_store_xyz.c b/SingleSource/UnitTests/Vector/AVX512F/load_store_xyz.c
new file mode 100644
index 00000000..35cad3a0
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512F/load_store_xyz.c
@@ -0,0 +1,479 @@
+
+/*
+ * Test 128 and 256-bit load and store intrinsics,
+ * with masked and zero-masked forms, by comparing
+ * their output with the corresponding 512-bit intrinsic.
+ *
+ * This test was created to check the correctness
+ * of the following intrinsics support:
+ * _mm512_load_si512()
+ * _mm512_store_si512()
+ * _mm256_storeu_ps()
+ * _mm256_storeu_si256()
+ * _mm_storeu_ps()
+ * _mm_storeu_si128()
+ */
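+
+/*
+ * A note on the distinction exercised below (informal): the aligned
+ * _mm512_load/_mm512_store forms require a 64-byte aligned memory
+ * operand, while the loadu/storeu forms accept any alignment, which is
+ * why the unaligned variants are applied at odd element offsets.
+ */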
+
+#include "m512_test_util.h"
+#include <stdio.h>
+
+V512 fsrc1;
+V512 fsrc2;
+V512 fsrc_non_negative;
+V512 fsrc_arr[2];
+
+V512 dsrc1;
+V512 dsrc2;
+V512 dsrc_non_negative;
+V512 dsrc_arr[2];
+
+V512 isrc1;
+V512 isrc2;
+V512 isrc_arr[2];
+
+static void NOINLINE init() {
+ volatile int i;
+
+ for (i = 0; i < 16; i++) {
+ fsrc1.f32[i] = (float)(i + 1);
+ fsrc2.f32[i] = 4.0f * (float)(i + 1);
+ fsrc_non_negative.f32[i] = 2.0f * (float)(i);
+ isrc2.s32[i] = (3 * i) - 17;
+ }
+
+ fsrc_arr[0] = fsrc2;
+ fsrc_arr[1] = fsrc1;
+
+ for (i = 0; i < 64; i++) {
+ isrc1.s8[i] = 2 * i + 1;
+ if (i % 3) {
+ isrc1.s8[i] = -isrc1.s8[i];
+ }
+ }
+
+ isrc_arr[0] = isrc2;
+ isrc_arr[1] = isrc1;
+
+ for (i = 0; i < 8; i++) {
+ dsrc1.f64[i] = (double)(-i - 1);
+ dsrc2.f64[i] = 3.0 * (double)(-i - 1);
+    dsrc_non_negative.f64[i] = 4.0 * (double)(i);
+ }
+
+ dsrc_arr[0] = dsrc2;
+ dsrc_arr[1] = dsrc1;
+}
+
+/*
+ * Use "soft update" between tests to make compiler think src was updated.
+ * Prevents PRE'ing a load of src, thus allowing ciscization.
+ * Also prevents PRE'ing intrinsic operations, ensuring we
+ * execute the intended instructions.
+ */
+volatile int vol0 = 0;
+#define soft_v512_update(var) (var).xmmi[vol0] = (var).xmmi[vol0]
+
+void NOINLINE do_load_and_loadu_pd() {
+ V512 xmm_res, ymm_res, zmm_res;
+ __mmask8 k8 = 0x7e;
+
+ /* Non-masked. */
+
+ soft_v512_update(dsrc1);
+ zmm_res.zmmd = _mm512_load_pd(&dsrc1.zmmd);
+
+ soft_v512_update(dsrc_arr[0]);
+ zmm_res.zmmd = _mm512_loadu_pd(&dsrc_arr[0].f64[1]);
+
+ /* Masked. */
+
+ zmm_res.zmmd = _mm512_setzero_pd();
+ ymm_res = zmm_res;
+ xmm_res = zmm_res;
+
+ soft_v512_update(dsrc1);
+ zmm_res.zmmd = _mm512_mask_load_pd(zmm_res.zmmd, k8, &dsrc1.zmmd);
+
+ zmm_res.zmmd = _mm512_setzero_pd();
+ ymm_res = zmm_res;
+ xmm_res = zmm_res;
+
+ soft_v512_update(dsrc_arr[0]);
+ zmm_res.zmmd = _mm512_mask_loadu_pd(zmm_res.zmmd, k8, &dsrc_arr[0].f64[3]);
+
+ /* Zero-masked. */
+
+ zmm_res.zmmd = _mm512_set1_pd(1.0);
+ ymm_res = zmm_res;
+ xmm_res = zmm_res;
+
+ soft_v512_update(dsrc1);
+ zmm_res.zmmd = _mm512_maskz_load_pd(k8, &dsrc1.zmmd);
+ soft_v512_update(dsrc1);
+
+ zmm_res.zmmd = _mm512_set1_pd(1.0);
+ ymm_res = zmm_res;
+ xmm_res = zmm_res;
+}
+
+void NOINLINE do_store_and_storeu_pd() {
+ V512 xmm_res[2], ymm_res[2], zmm_res[2];
+ __mmask8 k8 = 0xef;
+
+ /* Non-masked. */
+
+ zmm_res[0].zmmd = _mm512_set1_pd(1.0);
+ zmm_res[1] = zmm_res[0];
+ ymm_res[0] = zmm_res[0];
+ ymm_res[1] = zmm_res[0];
+ xmm_res[0] = zmm_res[0];
+ xmm_res[1] = zmm_res[0];
+
+ soft_v512_update(dsrc1);
+ _mm512_store_pd(&zmm_res[0].zmmd, dsrc1.zmmd);
+
+ soft_v512_update(dsrc1);
+ _mm512_storeu_pd(&zmm_res[0].f64[1], dsrc1.zmmd);
+
+ /* Masked. */
+
+ zmm_res[0].zmmd = _mm512_set1_pd(1.0);
+ zmm_res[1] = zmm_res[0];
+ ymm_res[0] = zmm_res[0];
+ ymm_res[1] = zmm_res[0];
+ xmm_res[0] = zmm_res[0];
+ xmm_res[1] = zmm_res[0];
+
+ soft_v512_update(dsrc1);
+ _mm512_mask_store_pd(&zmm_res[0].zmmd, k8, dsrc1.zmmd);
+
+ zmm_res[0].zmmd = _mm512_set1_pd(1.0);
+ zmm_res[1] = zmm_res[0];
+ ymm_res[0] = zmm_res[0];
+ ymm_res[1] = zmm_res[0];
+ xmm_res[0] = zmm_res[0];
+ xmm_res[1] = zmm_res[0];
+
+ soft_v512_update(dsrc1);
+ _mm512_mask_storeu_pd(&zmm_res[0].f64[1], k8, dsrc1.zmmd);
+}
+
+void NOINLINE do_load_and_loadu_ps() {
+ V512 xmm_res, ymm_res, zmm_res;
+ __mmask16 k16 = 0xff7e;
+ __mmask8 k8 = (__mmask8)k16;
+
+ /* Non-masked. */
+
+ soft_v512_update(fsrc1);
+ zmm_res.zmm = _mm512_load_ps(&fsrc1.zmm);
+
+ soft_v512_update(fsrc_arr[0]);
+ zmm_res.zmm = _mm512_loadu_ps(&fsrc_arr[0].f32[3]);
+
+ /* Masked. */
+
+ zmm_res.zmm = _mm512_setzero_ps();
+ ymm_res = zmm_res;
+ xmm_res = zmm_res;
+
+ soft_v512_update(fsrc1);
+ zmm_res.zmm = _mm512_mask_load_ps(zmm_res.zmm, k16, &fsrc1.zmm);
+
+ zmm_res.zmm = _mm512_setzero_ps();
+ ymm_res = zmm_res;
+ xmm_res = zmm_res;
+
+ soft_v512_update(fsrc_arr[0]);
+ zmm_res.zmm = _mm512_mask_loadu_ps(zmm_res.zmm, k16, &fsrc_arr[0].f32[5]);
+
+ /* Zero-masked. */
+
+ zmm_res.zmm = _mm512_set1_ps(1.0f);
+ ymm_res = zmm_res;
+ xmm_res = zmm_res;
+
+ soft_v512_update(fsrc1);
+ zmm_res.zmm = _mm512_maskz_load_ps(k16, &fsrc1.zmm);
+
+ zmm_res.zmm = _mm512_set1_ps(1.0f);
+ ymm_res = zmm_res;
+ xmm_res = zmm_res;
+
+ soft_v512_update(fsrc_arr[0]);
+ zmm_res.zmm = _mm512_maskz_loadu_ps(k16, &fsrc_arr[0].f32[5]);
+}
+
+void NOINLINE do_store_and_storeu_ps() {
+ V512 xmm_res[2], ymm_res[2], zmm_res[2];
+ __mmask16 k16 = 0xffef;
+ __mmask8 k8 = (__mmask8)k16;
+
+ /* Non-masked. */
+
+ zmm_res[0].zmm = _mm512_set1_ps(1.0f);
+ zmm_res[1] = zmm_res[0];
+ ymm_res[0] = zmm_res[0];
+ ymm_res[1] = zmm_res[0];
+ xmm_res[0] = zmm_res[0];
+ xmm_res[1] = zmm_res[0];
+
+ soft_v512_update(fsrc1);
+ _mm512_store_ps(&zmm_res[0].zmm, fsrc1.zmm);
+
+ soft_v512_update(fsrc1);
+ _mm512_storeu_ps(&zmm_res[0].f32[1], fsrc1.zmm);
+ soft_v512_update(fsrc1);
+ _mm256_storeu_ps(&ymm_res[0].f32[1], fsrc1.ymm[0]);
+ soft_v512_update(fsrc1);
+ _mm_storeu_ps(&xmm_res[0].f32[1], fsrc1.xmm[0]);
+
+ check_equal_nsf(&ymm_res[0].f32[1], &zmm_res[0].f32[1], 8, "_mm256_storeu_ps",
+ __LINE__);
+ check_equal_nsf(&xmm_res[0].f32[1], &zmm_res[0].f32[1], 4, "_mm_storeu_ps",
+ __LINE__);
+
+ /* Masked. */
+
+ zmm_res[0].zmm = _mm512_set1_ps(1.0f);
+ zmm_res[1] = zmm_res[0];
+ ymm_res[0] = zmm_res[0];
+ ymm_res[1] = zmm_res[0];
+ xmm_res[0] = zmm_res[0];
+ xmm_res[1] = zmm_res[0];
+
+ soft_v512_update(fsrc1);
+  _mm512_mask_store_ps(&zmm_res[0].zmm, k16, fsrc1.zmm);
+
+ zmm_res[0].zmm = _mm512_set1_ps(1.0f);
+ zmm_res[1] = zmm_res[0];
+ ymm_res[0] = zmm_res[0];
+ ymm_res[1] = zmm_res[0];
+ xmm_res[0] = zmm_res[0];
+ xmm_res[1] = zmm_res[0];
+
+ soft_v512_update(fsrc1);
+  _mm512_mask_storeu_ps(&zmm_res[0].f32[1], k16, fsrc1.zmm);
+}
+
+void NOINLINE do_load_and_loadu_epi32() {
+ V512 xmm_res, ymm_res, zmm_res;
+ __mmask16 k16 = 0xffef;
+ __mmask8 k8 = (__mmask8)k16;
+
+ /* Non-masked. */
+
+ soft_v512_update(isrc1);
+ zmm_res.zmmi = _mm512_load_epi32(&isrc1.zmmi);
+
+ soft_v512_update(isrc1);
+ ymm_res.zmmi = _mm512_load_si512(&isrc1.zmmi);
+ check_equal_nd(&ymm_res, &zmm_res, 16, "_mm512_load_si512", __LINE__);
+
+ soft_v512_update(isrc_arr[0]);
+ zmm_res.zmmi = _mm512_loadu_si512(&isrc_arr[0].s32[1]);
+
+ /* Masked. */
+
+ zmm_res.zmmi = _mm512_setzero_epi32();
+ ymm_res = zmm_res;
+ xmm_res = zmm_res;
+
+ soft_v512_update(isrc1);
+ zmm_res.zmmi = _mm512_mask_load_epi32(zmm_res.zmmi, k16, &isrc1.zmmi);
+
+ zmm_res.zmmi = _mm512_setzero_epi32();
+ ymm_res = zmm_res;
+ xmm_res = zmm_res;
+
+ soft_v512_update(isrc_arr[0]);
+ zmm_res.zmmi =
+ _mm512_mask_loadu_epi32(zmm_res.zmmi, k16, &isrc_arr[0].s32[3]);
+
+ /* Zero-masked. */
+
+ zmm_res.zmmi = _mm512_set1_epi32(-7);
+ ymm_res = zmm_res;
+ xmm_res = zmm_res;
+
+ soft_v512_update(isrc1);
+ zmm_res.zmmi = _mm512_maskz_load_epi32(k16, &isrc1.zmmi);
+
+ zmm_res.zmmi = _mm512_set1_epi32(11);
+ ymm_res = zmm_res;
+ xmm_res = zmm_res;
+
+ soft_v512_update(isrc_arr[0]);
+ zmm_res.zmmi = _mm512_maskz_loadu_epi32(k16, &isrc_arr[0].s32[1]);
+}
+
+void NOINLINE do_store_and_storeu_epi32() {
+ V512 xmm_res[2], ymm_res[2], zmm_res[2];
+ __mmask16 k16 = 0xfeff;
+ __mmask8 k8 = (__mmask8)k16;
+
+ /* Non-masked. */
+
+ zmm_res[0].zmmi = _mm512_set1_epi32(-101);
+ zmm_res[1] = zmm_res[0];
+ ymm_res[0] = zmm_res[0];
+ ymm_res[1] = zmm_res[0];
+ xmm_res[0] = zmm_res[0];
+ xmm_res[1] = zmm_res[0];
+
+ soft_v512_update(isrc1);
+ _mm512_store_epi32(&zmm_res[0].zmmi, isrc1.zmmi);
+ soft_v512_update(isrc1);
+ _mm256_store_si256((__m256i *)&ymm_res[0].s32[0], isrc1.ymmi[0]);
+ soft_v512_update(isrc1);
+ _mm_store_si128((__m128i *)&xmm_res[0].s32[0], isrc1.xmmi[0]);
+
+ check_equal_nd(&ymm_res, &zmm_res, 8, "_mm256_store_epi32", __LINE__);
+ check_equal_nd(&xmm_res, &zmm_res, 4, "_mm_store_epi32", __LINE__);
+
+ soft_v512_update(isrc1);
+ ymm_res[0].zmmi = _mm512_setzero_si512();
+ _mm512_store_si512(&ymm_res[0].zmmi, isrc1.zmmi);
+ check_equal_nd(&ymm_res, &zmm_res, 16, "_mm512_store_si512", __LINE__);
+
+ soft_v512_update(isrc1);
+ _mm512_storeu_si512(&zmm_res[0].s32[1], isrc1.zmmi);
+ soft_v512_update(isrc1);
+ _mm256_storeu_si256((__m256i *)&ymm_res[0].s32[1], isrc1.ymmi[0]);
+ soft_v512_update(isrc1);
+ _mm_storeu_si128((__m128i *)&xmm_res[0].s32[1], isrc1.xmmi[0]);
+
+ check_equal_nd(&ymm_res[0].s32[1], &zmm_res[0].s32[1], 8,
+ "_mm256_storeu_si256", __LINE__);
+ check_equal_nd(&xmm_res[0].s32[1], &zmm_res[0].s32[1], 4, "_mm_storeu_si128",
+ __LINE__);
+
+ /* Masked. */
+
+ zmm_res[0].zmmi = _mm512_set1_epi32(999);
+ zmm_res[1] = zmm_res[0];
+ ymm_res[0] = zmm_res[0];
+ ymm_res[1] = zmm_res[0];
+ xmm_res[0] = zmm_res[0];
+ xmm_res[1] = zmm_res[0];
+
+ soft_v512_update(isrc1);
+ _mm512_mask_store_epi32(&zmm_res[0].zmmi, k16, isrc1.zmmi);
+
+ zmm_res[0].zmmi = _mm512_set1_epi32(-3);
+ zmm_res[1] = zmm_res[0];
+ ymm_res[0] = zmm_res[0];
+ ymm_res[1] = zmm_res[0];
+ xmm_res[0] = zmm_res[0];
+ xmm_res[1] = zmm_res[0];
+
+ soft_v512_update(isrc1);
+ _mm512_mask_storeu_epi32(&zmm_res[0].s32[1], k16, isrc1.zmmi);
+}
+
+void NOINLINE do_load_and_loadu_epi64() {
+ V512 xmm_res, ymm_res, zmm_res;
+  __mmask8 k8 = 0xef;
+
+ /* Non-masked. */
+
+ soft_v512_update(isrc1);
+ zmm_res.zmmi = _mm512_load_epi64(&isrc1.zmmi);
+
+ soft_v512_update(isrc_arr[0]);
+ zmm_res.zmmi = _mm512_loadu_si512(&isrc_arr[0].s64[1]);
+
+ /* Masked. */
+
+ zmm_res.zmmi = _mm512_setzero_epi32();
+ ymm_res = zmm_res;
+ xmm_res = zmm_res;
+
+ soft_v512_update(isrc1);
+ zmm_res.zmmi = _mm512_mask_load_epi64(zmm_res.zmmi, k8, &isrc1.zmmi);
+
+ zmm_res.zmmi = _mm512_setzero_epi32();
+ ymm_res = zmm_res;
+ xmm_res = zmm_res;
+
+ soft_v512_update(isrc_arr[0]);
+ zmm_res.zmmi = _mm512_mask_loadu_epi64(zmm_res.zmmi, k8, &isrc_arr[0].s64[3]);
+
+ /* Zero-masked. */
+
+ zmm_res.zmmi = _mm512_set1_epi64(-7);
+ ymm_res = zmm_res;
+ xmm_res = zmm_res;
+}
+
+void NOINLINE do_store_and_storeu_epi64() {
+ V512 xmm_res[2], ymm_res[2], zmm_res[2];
+ __mmask8 k8 = 0xfe;
+
+ /* Non-masked. */
+
+ zmm_res[0].zmmi = _mm512_set1_epi32(-101);
+ zmm_res[1] = zmm_res[0];
+ ymm_res[0] = zmm_res[0];
+ ymm_res[1] = zmm_res[0];
+ xmm_res[0] = zmm_res[0];
+ xmm_res[1] = zmm_res[0];
+
+ soft_v512_update(isrc1);
+ _mm512_store_epi64(&zmm_res[0].zmmi, isrc1.zmmi);
+ soft_v512_update(isrc1);
+ _mm256_store_si256((__m256i *)&ymm_res[0].s64[0], isrc1.ymmi[0]);
+ soft_v512_update(isrc1);
+ _mm_store_si128((__m128i *)&xmm_res[0].s64[0], isrc1.xmmi[0]);
+
+ check_equal_nq(&ymm_res, &zmm_res, 4, "_mm256_store_epi64", __LINE__);
+  check_equal_nq(&xmm_res, &zmm_res, 2, "_mm_store_epi64", __LINE__);
+
+ soft_v512_update(isrc1);
+ _mm512_storeu_si512(&zmm_res[0].s64[1], isrc1.zmmi);
+
+ /* Masked. */
+
+ zmm_res[0].zmmi = _mm512_set1_epi32(999);
+ zmm_res[1] = zmm_res[0];
+ ymm_res[0] = zmm_res[0];
+ ymm_res[1] = zmm_res[0];
+ xmm_res[0] = zmm_res[0];
+ xmm_res[1] = zmm_res[0];
+
+ soft_v512_update(isrc1);
+ _mm512_mask_store_epi64(&zmm_res[0].zmmi, k8, isrc1.zmmi);
+
+ zmm_res[0].zmmi = _mm512_set1_epi32(-3);
+ zmm_res[1] = zmm_res[0];
+ ymm_res[0] = zmm_res[0];
+ ymm_res[1] = zmm_res[0];
+ xmm_res[0] = zmm_res[0];
+ xmm_res[1] = zmm_res[0];
+
+ soft_v512_update(isrc1);
+ _mm512_mask_storeu_epi64(&zmm_res[0].s64[1], k8, isrc1.zmmi);
+}
+
+int main() {
+ init();
+
+ do_load_and_loadu_pd();
+ do_load_and_loadu_ps();
+ do_load_and_loadu_epi32();
+ do_store_and_storeu_epi32();
+ do_load_and_loadu_epi64();
+
+ do_store_and_storeu_pd();
+ do_store_and_storeu_ps();
+
+ do_store_and_storeu_epi64();
+
+ if (n_errs != 0) {
+ printf("FAILED\n");
+ return 1;
+ }
+
+ printf("PASSED\n");
+ return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512F/load_store_xyz.reference_output b/SingleSource/UnitTests/Vector/AVX512F/load_store_xyz.reference_output
new file mode 100644
index 00000000..bfae62d0
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512F/load_store_xyz.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512F/mask_mov.c b/SingleSource/UnitTests/Vector/AVX512F/mask_mov.c
new file mode 100644
index 00000000..4b6aeaaa
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512F/mask_mov.c
@@ -0,0 +1,135 @@
+/*
+ * Test mask_mov and maskz_mov instructions.
+ */
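+
+/*
+ * Informal per-element sketch of what is verified below:
+ *   mask_mov:  dst[j] = ((k >> j) & 1) ? src[j] : passthru[j];
+ *   maskz_mov: dst[j] = ((k >> j) & 1) ? src[j] : 0;
+ */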
+
+#include "m512_test_util.h"
+#include <stdio.h>
+
+__m512i i1;
+__m512i i2;
+__m512i i3;
+__m512i i4;
+
+__m512 f1;
+__m512 f2;
+__m512 f3;
+__m512 f4;
+
+__m512d d1;
+__m512d d2;
+__m512d d3;
+__m512d d4;
+
+volatile int vol = 0; /* To prevent optimizations */
+
+void NOINLINE set_nonzero(void *vp, int c) {
+ int i;
+ V512 *v = (V512 *)vp;
+
+ for (i = 0; i < 16; i++) {
+ v->u32[i] = 10 * i * i - 3 * i + c + vol;
+ if (v->u32[i] == 0) {
+ v->u32[i] = 1234;
+ }
+ }
+}
+
+void NOINLINE check_equal32(void *vgot, void *vexpected, void *vexpected_orig,
+ int mask, char *banner) {
+ int i;
+ V512 *got = (V512 *)vgot;
+ V512 *expected = (V512 *)vexpected;
+ V512 *orig = (V512 *)vexpected_orig;
+
+ for (i = 0; i < 16; i++) {
+ int ans = (mask & (1 << i)) ? expected->s32[i] : orig->s32[i];
+ if (got->s32[i] != ans) {
+      printf("ERROR: %s failed -- 0x%08x != 0x%08x at element [%d]\n",
+ banner ? banner : "", got->s32[i], ans, i);
+ n_errs++;
+ break;
+ }
+ }
+}
+
+void NOINLINE check_equal64(void *vgot, void *vexpected, void *vexpected_orig,
+ int mask, char *banner) {
+ int i;
+ V512 *got = (V512 *)vgot;
+ V512 *expected = (V512 *)vexpected;
+ V512 *orig = (V512 *)vexpected_orig;
+
+ for (i = 0; i < 8; i++) {
+ __int64 ans = (mask & (1 << i)) ? expected->s64[i] : orig->s64[i];
+ if (got->s64[i] != ans) {
+      printf("ERROR: %s failed -- %016" PRIx64 " != %016" PRIx64
+ " at element [%d]\n",
+ banner ? banner : "", got->s64[i], ans, i);
+ n_errs++;
+ break;
+ }
+ }
+}
+
+void NOINLINE do_mask_mov_32() {
+ int k = 0xf2f3;
+ __m512i zeroi = _mm512_setzero_epi32();
+
+ set_nonzero(&i1, -97);
+ set_nonzero(&i2, 22);
+ set_nonzero(&i3, 22);
+
+ i3 = _mm512_mask_mov_epi32(i1, k, i2);
+ check_equal32(&i3, &i2, &i1, k, "_mm512_mask_mov_epi32");
+
+ i4 = _mm512_maskz_mov_epi32(k, i1);
+ check_equal32(&i4, &i1, &zeroi, k, "_mm512_maskz_mov_epi32");
+
+ set_nonzero(&f1, -96);
+ set_nonzero(&f2, 21);
+ set_nonzero(&f3, 1400);
+
+ f3 = _mm512_mask_mov_ps(f1, k, f2);
+ check_equal32(&f3, &f2, &f1, k, "_mm512_mask_mov_ps");
+
+ f4 = _mm512_maskz_mov_ps(k, f1);
+ check_equal32(&f4, &f1, &zeroi, k, "_mm512_maskz_mov_ps");
+}
+
+void NOINLINE do_mask_mov_64() {
+ __mmask8 k = 0x59;
+ __m512i zeroi = _mm512_setzero_epi32();
+
+ set_nonzero(&i1, -97);
+ set_nonzero(&i2, 22);
+ set_nonzero(&i3, 22);
+
+ i3 = _mm512_mask_mov_epi64(i1, k, i2);
+ check_equal64(&i3, &i2, &i1, k, "_mm512_mask_mov_epi64");
+
+ i4 = _mm512_maskz_mov_epi64(k, i1);
+ check_equal64(&i4, &i1, &zeroi, k, "_mm512_maskz_mov_epi64");
+
+ set_nonzero(&d1, -96);
+ set_nonzero(&d2, 21);
+ set_nonzero(&d3, 1400);
+
+ d3 = _mm512_mask_mov_pd(d1, k, d2);
+ check_equal64(&d3, &d2, &d1, k, "_mm512_mask_mov_pd");
+
+ d4 = _mm512_maskz_mov_pd(k, d1);
+ check_equal64(&d4, &d1, &zeroi, k, "_mm512_maskz_mov_pd");
+}
+
+int main() {
+ do_mask_mov_32();
+ do_mask_mov_64();
+
+ if (n_errs != 0) {
+ printf("FAILED\n");
+ return 1;
+ }
+
+ printf("PASSED\n");
+ return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512F/mask_mov.reference_output b/SingleSource/UnitTests/Vector/AVX512F/mask_mov.reference_output
new file mode 100644
index 00000000..bfae62d0
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512F/mask_mov.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512F/movedup.c b/SingleSource/UnitTests/Vector/AVX512F/movedup.c
new file mode 100644
index 00000000..7000115b
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512F/movedup.c
@@ -0,0 +1,213 @@
+/*
+ * Test movedup, moveldup and movehdup instructions.
+ * Here we check for the _mm512_[mask|maskz]_move[d|ld|hd]up intrinsics.
+ */
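+
+/*
+ * Informal lane semantics, mirrored by the emulate_* helpers below:
+ *   movedup_pd:  dst[j] = src[j & ~1]  (duplicate even 64-bit lanes)
+ *   moveldup_ps: dst[j] = src[j & ~1]  (duplicate even 32-bit lanes)
+ *   movehdup_ps: dst[j] = src[j | 1]   (duplicate odd 32-bit lanes)
+ */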
+
+#include "m512_test_util.h"
+#include <stdio.h>
+
+int verbose = 0;
+
+__m512 f1;
+__m512 f2;
+__m512 f3;
+__m512 f4;
+
+__m512d d1;
+__m512d d2;
+__m512d d3;
+__m512d d4;
+
+volatile int vol = 0; /* To prevent optimizations */
+
+void NOINLINE init() {
+ int i;
+ V512 *pf1 = (V512 *)&f1;
+ V512 *pf2 = (V512 *)&f2;
+ V512 *pd1 = (V512 *)&d1;
+ V512 *pd2 = (V512 *)&d2;
+
+ for (i = 0; i < 16; i++) {
+ pf1->f32[i] = 17 + ((i & 1) ? 1 : -1) * i + vol;
+ pf2->f32[i] = -(100 + ((i & 3) == 3 ? 1 : -1) * i + vol);
+ }
+
+ for (i = 0; i < 8; i++) {
+ pd1->f64[i] = pf1->f32[i];
+ pd2->f64[i] = -pf2->f32[i];
+ }
+}
+
+void NOINLINE check_equal32(void *vgot, void *vexpected, void *vexpected_orig,
+ int mask, char *banner) {
+ int i;
+ V512 *got = (V512 *)vgot;
+ V512 *expected = (V512 *)vexpected;
+ V512 *orig = (V512 *)vexpected_orig;
+
+ for (i = 0; i < 16; i++) {
+ int ans = (mask & (1 << i)) ? expected->u32[i] : orig->u32[i];
+ if (got->u32[i] != ans) {
+      printf("ERROR: %s failed -- 0x%08x != 0x%08x at element [%d]\n",
+ banner ? banner : "", got->u32[i], ans, i);
+ n_errs++;
+ break;
+ }
+ }
+}
+
+void NOINLINE check_equal64(void *vgot, void *vexpected, void *vexpected_orig,
+ int mask, char *banner) {
+ int i;
+ V512 *got = (V512 *)vgot;
+ V512 *expected = (V512 *)vexpected;
+ V512 *orig = (V512 *)vexpected_orig;
+
+ for (i = 0; i < 8; i++) {
+ __int64 ans = (mask & (1 << i)) ? expected->u64[i] : orig->u64[i];
+ if (got->u64[i] != ans) {
+      printf("ERROR: %s failed -- %016" PRIx64 " != %016" PRIx64
+ " at element [%d]\n",
+ banner ? banner : "", got->u64[i], ans, i);
+ n_errs++;
+ break;
+ }
+ }
+}
+
+void NOINLINE emulate_movedup_pd(void *presult, const void *p1, int mask,
+ const void *p2, int zero_masking) {
+ int i;
+ V512 *result = (V512 *)presult;
+ V512 *v1 = (V512 *)p1;
+ V512 *v2 = (V512 *)p2;
+
+ for (i = 0; i < 8; i++) {
+
+ if (((1 << i) & mask) == 0) {
+ result->u64[i] = zero_masking ? 0 : v1->u64[i];
+ } else {
+ int src_index = i & 0xfe; // even index
+ result->u64[i] = v2->u64[src_index];
+ }
+ }
+}
+
+void NOINLINE emulate_moveldup_ps(void *presult, const void *p1, int mask,
+ const void *p2, int zero_masking) {
+ int i;
+ V512 *result = (V512 *)presult;
+ V512 *v1 = (V512 *)p1;
+ V512 *v2 = (V512 *)p2;
+
+ for (i = 0; i < 16; i++) {
+
+ if (((1 << i) & mask) == 0) {
+ result->u32[i] = zero_masking ? 0 : v1->u32[i];
+ } else {
+ int src_index = i & 0xfe; // even index
+ result->u32[i] = v2->u32[src_index];
+ }
+ }
+}
+
+void NOINLINE emulate_movehdup_ps(void *presult, const void *p1, int mask,
+ const void *p2, int zero_masking) {
+ int i;
+ V512 *result = (V512 *)presult;
+ V512 *v1 = (V512 *)p1;
+ V512 *v2 = (V512 *)p2;
+
+ for (i = 0; i < 16; i++) {
+
+ if (((1 << i) & mask) == 0) {
+ result->u32[i] = zero_masking ? 0 : v1->u32[i];
+ } else {
+ int src_index = (i & 0xfe) + 1; // odd index
+ result->u32[i] = v2->u32[src_index];
+ }
+ }
+}
+
+void NOINLINE do_movedup_pd() {
+ if (verbose) {
+ printf("BEGIN do_movedup_pd\n");
+ }
+
+ d3 = _mm512_movedup_pd(d2);
+ emulate_movedup_pd(&d4, (void *)0, 0xff, &d2, 0);
+ check_equal64(&d3, &d4, (void *)0, 0xff, "_mm512_movedup_pd");
+
+ d3 = _mm512_maskz_movedup_pd(0xc5, d2);
+ emulate_movedup_pd(&d4, (void *)0, 0xc5, &d2, 1);
+ check_equal64(&d3, &d4, (void *)0, 0xff, "_mm512_maskz_movedup_pd");
+
+ d3 = _mm512_mask_movedup_pd(d1, 0xda, d2);
+ emulate_movedup_pd(&d4, &d1, 0xda, &d2, 0);
+ check_equal64(&d3, &d4, (void *)0, 0xff, "_mm512_mask_movedup_pd");
+
+ if (verbose) {
+ printf("DONE\n");
+ }
+}
+
+void NOINLINE do_moveldup_ps() {
+ if (verbose) {
+ printf("BEGIN do_moveldup_ps\n");
+ }
+
+ f3 = _mm512_moveldup_ps(f2);
+ emulate_moveldup_ps(&f4, (void *)0, 0xffff, &f2, 0);
+ check_equal32(&f3, &f4, (void *)0, 0xffff, "_mm512_moveldup_ps");
+
+ f3 = _mm512_maskz_moveldup_ps(0x79fa, f2);
+ emulate_moveldup_ps(&f4, (void *)0, 0x79fa, &f2, 1);
+ check_equal32(&f3, &f4, (void *)0, 0xffff, "_mm512_maskz_moveldup_ps");
+
+ f3 = _mm512_mask_moveldup_ps(f1, 0x53da, f2);
+ emulate_moveldup_ps(&f4, &f1, 0x53da, &f2, 0);
+ check_equal32(&f3, &f4, (void *)0, 0xffff, "_mm512_mask_moveldup_ps");
+
+ if (verbose) {
+ printf("DONE\n");
+ }
+}
+
+void NOINLINE do_movehdup_ps() {
+ if (verbose) {
+ printf("BEGIN do_movehdup_ps\n");
+ }
+
+ f3 = _mm512_movehdup_ps(f2);
+ emulate_movehdup_ps(&f4, (void *)0, 0xffff, &f2, 0);
+ check_equal32(&f3, &f4, (void *)0, 0xffff, "_mm512_movehdup_ps");
+
+ f3 = _mm512_maskz_movehdup_ps(0x79fa, f2);
+ emulate_movehdup_ps(&f4, (void *)0, 0x79fa, &f2, 1);
+ check_equal32(&f3, &f4, (void *)0, 0xffff, "_mm512_maskz_movehdup_ps");
+
+ f3 = _mm512_mask_movehdup_ps(f1, 0x79fa, f2);
+ emulate_movehdup_ps(&f4, &f1, 0x79fa, &f2, 0);
+ check_equal32(&f3, &f4, (void *)0, 0xffff, "_mm512_mask_movehdup_ps");
+
+ if (verbose) {
+ printf("DONE\n");
+ }
+}
+
+int main(int argc, char *argv[]) {
+ init();
+
+ do_movedup_pd();
+
+ do_moveldup_ps();
+ do_movehdup_ps();
+
+ if (n_errs != 0) {
+ printf("FAILED\n");
+ return 1;
+ }
+
+ printf("PASSED\n");
+ return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512F/movedup.reference_output b/SingleSource/UnitTests/Vector/AVX512F/movedup.reference_output
new file mode 100644
index 00000000..bfae62d0
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512F/movedup.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512F/store.c b/SingleSource/UnitTests/Vector/AVX512F/store.c
new file mode 100644
index 00000000..4776d414
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512F/store.c
@@ -0,0 +1,144 @@
+/*
+ * Test store instructions.
+ * This test was created to check the correctness
+ * of the following intrinsics support:
+ * _mm512_store_ps()
+ * _mm512_mask_store_ps()
+ * _mm512_store_epi32()
+ * _mm512_mask_store_epi32()
+ * _mm512_store_epi64()
+ * _mm512_mask_store_epi64()
+ * _mm512_store_pd()
+ * _mm512_mask_store_pd()
+ */
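+
+/*
+ * Note (informal): the aligned _mm512_store_* forms used here require a
+ * 64-byte aligned destination, hence the ALIGNTO(64) on dest_memory.
+ */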
+
+#include "m512_test_util.h"
+#include <stdio.h>
+
+__m512 v1;
+__m512i i1;
+__m512d d1;
+V512 tval;
+int ALIGNTO(64) dest_memory[16];
+unsigned int et_memory[16];
+void *pdst = (void *)&dest_memory;
+
+unsigned int initial = 0;
+void NOINLINE set_nonzero(void *vp, int c) {
+ int i;
+ V512 *v = (V512 *)vp;
+
+ for (i = 0; i < 16; i++) {
+ v->u32[i] = 10 * i * i - 3 * i + c;
+ if (v->u32[i] == 0) {
+ v->u32[i] = 1234;
+ }
+ et_memory[i] = initial;
+ dest_memory[i] = initial;
+ }
+}
+
+void NOINLINE set_generic32_result(void *vp1, int mask, int subset) {
+ int i, max;
+ V512 *v1 = (V512 *)vp1;
+
+ max = (subset == 1 ? 1 : (subset == 2 ? 4 : 16));
+ for (i = 0; i < max; i++) {
+ if ((mask & 0x1) != 0) {
+ et_memory[i] = v1->u32[i];
+ }
+ mask >>= 1;
+ }
+}
+
+void NOINLINE set_generic64_result(void *vp1, int mask, int subset) {
+ int i, max;
+ V512 *v1 = (V512 *)vp1;
+
+ max = (subset == 1 ? 1 : (subset == 2 ? 4 : 8));
+ for (i = 0; i < max; i++) {
+ if ((mask & 0x1) != 0) {
+ ((U64 *)&et_memory)[i] = v1->u64[i];
+ }
+ mask >>= 1;
+ }
+}
+
+void NOINLINE do_store_ps() {
+ /* full vector */
+ set_nonzero(&v1, 11);
+ set_generic32_result(&v1, 0xffff, 0);
+ _mm512_store_ps(pdst, v1);
+ check_equal_nd(pdst, &et_memory, 16, "_mm512_store_ps - full vector",
+ __LINE__);
+
+ /* vector with write mask */
+ set_nonzero(&v1, 5);
+ set_generic32_result(&v1, 0xf00f, 0);
+ _mm512_mask_store_ps(pdst, 0xf00f, v1);
+ check_equal_nd(pdst, &et_memory, 16, "_mm512_mask_store_ps - full vector",
+ __LINE__);
+}
+
+void NOINLINE do_store_epi32() {
+ /* full vector */
+ set_nonzero(&i1, 11);
+ set_generic32_result(&i1, 0xffff, 0);
+ _mm512_store_epi32(pdst, i1);
+ check_equal_nd(pdst, &et_memory, 16, "_mm512_store_epi32 - full vector",
+ __LINE__);
+
+ /* vector with write mask */
+ set_nonzero(&i1, 5);
+ set_generic32_result(&i1, 0xf00f, 0);
+ _mm512_mask_store_epi32(pdst, 0xf00f, i1);
+ check_equal_nd(pdst, &et_memory, 16, "_mm512_mask_store_epi32 - full vector",
+ __LINE__);
+}
+
+void NOINLINE do_store_epi64() {
+ /* full vector */
+ set_nonzero(&i1, 11);
+ set_generic64_result(&i1, 0xff, 0);
+ _mm512_store_epi64(pdst, i1);
+ check_equal_nq(pdst, &et_memory, 8, "_mm512_store_epi64 - full vector",
+ __LINE__);
+
+ /* vector with write mask */
+ set_nonzero(&i1, 5);
+ set_generic64_result(&i1, 0x60, 0);
+ _mm512_mask_store_epi64(pdst, 0x60, i1);
+ check_equal_nq(pdst, &et_memory, 8, "_mm512_mask_store_epi64 - full vector",
+ __LINE__);
+}
+
+void NOINLINE do_store_pd() {
+ /* full vector */
+ set_nonzero(&d1, 11);
+ set_generic64_result(&d1, 0xff, 0);
+ _mm512_store_pd(pdst, d1);
+ check_equal_nq(pdst, &et_memory, 8, "_mm512_store_pd - full vector",
+ __LINE__);
+
+ /* vector with write mask */
+ set_nonzero(&d1, 5);
+ set_generic64_result(&d1, 0xf1, 0);
+ _mm512_mask_store_pd(pdst, 0xf1, d1);
+  check_equal_nq(pdst, &et_memory, 8, "_mm512_mask_store_pd - write mask",
+ __LINE__);
+}
+
+int main() {
+ do_store_ps();
+ do_store_epi32();
+ do_store_epi64();
+ do_store_pd();
+
+ if (n_errs != 0) {
+ printf("FAILED\n");
+ return 1;
+ }
+
+ printf("PASSED\n");
+ return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512F/store.reference_output b/SingleSource/UnitTests/Vector/AVX512F/store.reference_output
new file mode 100644
index 00000000..bfae62d0
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512F/store.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512VL/CMakeLists.txt b/SingleSource/UnitTests/Vector/AVX512VL/CMakeLists.txt
new file mode 100644
index 00000000..ef2307e8
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512VL/CMakeLists.txt
@@ -0,0 +1,5 @@
+list(APPEND CPPFLAGS -I ${CMAKE_SOURCE_DIR}/${VECTOR_MAIN_DIR})
+list(APPEND LDFLAGS -lm)
+list(APPEND CFLAGS "-march=${X86CPU_ARCH}")
+list(APPEND CFLAGS -fms-extensions)
+llvm_singlesource(PREFIX "Vector-AVX512VL-")
diff --git a/SingleSource/UnitTests/Vector/AVX512VL/Makefile b/SingleSource/UnitTests/Vector/AVX512VL/Makefile
new file mode 100644
index 00000000..2c96e576
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512VL/Makefile
@@ -0,0 +1,11 @@
+# SingleSource/UnitTests/Vector/AVX512VL/Makefile
+
+DIRS =
+LEVEL = ../../../..
+CFLAGS += -fms-extensions -march=native -mavx512vl -I${SourceDir}/..
+LDFLAGS += -lm
+
+include $(LEVEL)/SingleSource/Makefile.singlesrc
+
+TARGET_FLAGS += -march=native -mavx512vl
+LCCFLAGS += -march=native -mavx512vl
diff --git a/SingleSource/UnitTests/Vector/AVX512VL/i32gather_32.c b/SingleSource/UnitTests/Vector/AVX512VL/i32gather_32.c
new file mode 100644
index 00000000..1b302b6d
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512VL/i32gather_32.c
@@ -0,0 +1,135 @@
+/*
+ * This test was created to check the correctness
+ * of support for the following intrinsics:
+ *      _mm_mmask_i32gather_epi32()
+ *      _mm_mmask_i32gather_ps()
+ *      _mm256_mmask_i32gather_epi32()
+ *      _mm256_mmask_i32gather_ps()
+ */
+
+#include <stdio.h>
+#include <x86intrin.h>
+
+#define NUM (256 * 256)
+#define SCALE 4
+
+float dst128_f[NUM];
+float dst256_f[NUM];
+float dst_f[NUM];
+int dst128_i[NUM];
+int dst256_i[NUM];
+int dst_i[NUM];
+float src_f[NUM];
+int src_i[NUM];
+int mask128[NUM / 4];
+int mask256[NUM / 8];
+int g_index[NUM];
+
+#define MIN(x, y) ((x) <= (y) ? (x) : (y))
+
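+/* i * 17 & 0xce keeps every index well inside src, so the MIN clamp is
+   just a safety net; masks get per-vector pseudo-random bit patterns. */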
+void init_data() {
+ int i;
+ for (i = 0; i < NUM; i++) {
+ g_index[i] = MIN(i * 17 & 0xce, NUM);
+ src_f[g_index[i]] = src_i[g_index[i]] = i;
+
+ dst_i[i] = dst_f[i] = -i;
+ dst128_i[i] = dst256_i[i] = -i;
+ dst128_f[i] = dst256_f[i] = -i;
+
+ if (i % 4 == 0) {
+ mask128[i / 4] = (i * 77) & 0xf;
+ if (i % 8 == 0) {
+ mask256[i / 8] = (i * 31) & 0xff;
+ }
+ }
+ }
+}
+
+void do_mm_mmask_i32gather_epi32() {
+ int i;
+ for (i = 0; i < NUM; i += 4) {
+ __m128i ind = _mm_loadu_si128((const __m128i *)(g_index + i));
+ __m128i old_dst = _mm_loadu_si128((__m128i const *)(dst_i + i));
+ __m128i gtr =
+ _mm_mmask_i32gather_epi32(old_dst, mask128[i / 4], ind, src_i, SCALE);
+ _mm_storeu_si128((__m128i *)(dst128_i + i), gtr);
+ }
+}
+
+void do_mm_mmask_i32gather_ps() {
+ int i;
+ for (i = 0; i < NUM; i += 4) {
+ __m128i ind = _mm_loadu_si128((const __m128i *)(g_index + i));
+ __m128 old_dst = _mm_loadu_ps(dst_f + i);
+ __m128 gtr =
+ _mm_mmask_i32gather_ps(old_dst, mask128[i / 4], ind, src_f, SCALE);
+ _mm_storeu_ps(dst128_f + i, gtr);
+ }
+}
+
+void do_mm256_mmask_i32gather_epi32() {
+ int i;
+ for (i = 0; i < NUM; i += 8) {
+ __m256i ind = _mm256_loadu_si256((const __m256i *)(g_index + i));
+ __m256i old_dst = _mm256_loadu_si256((const __m256i *)(dst_i + i));
+ __m256i gtr = _mm256_mmask_i32gather_epi32(old_dst, mask256[i / 8], ind,
+ src_i, SCALE);
+ _mm256_storeu_si256((__m256i *)(dst256_i + i), gtr);
+ }
+}
+
+void do_mm256_mmask_i32gather_ps() {
+ int i;
+ for (i = 0; i < NUM; i += 8) {
+ __m256i ind = _mm256_loadu_si256((const __m256i *)(g_index + i));
+ __m256 old_dst = _mm256_loadu_ps(dst_f + i);
+ __m256 gtr =
+ _mm256_mmask_i32gather_ps(old_dst, mask256[i / 8], ind, src_f, SCALE);
+ _mm256_storeu_ps(dst256_f + i, gtr);
+ }
+}
+
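+/* Elements whose mask bit is set must hold the gathered value
+   src[g_index[i]]; all others must keep the pass-through value. */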
+int check(int id, int *res_dst, int *pass_thru_vals, int *mask, int *src,
+ int elems_in_vector) {
+ int i;
+ for (i = 0; i < NUM; i++) {
+ int kmask = mask[i / elems_in_vector];
+ int kmask_bit = kmask & (1 << (i % elems_in_vector));
+
+ int v = kmask_bit ? src[g_index[i]] : pass_thru_vals[i];
+
+ if (v != res_dst[i]) {
+ printf("The testcase #%d FAILed at %d iteration\n", id, i);
+
+ printf("Expected value %d, actual %d\n", v, res_dst[i]);
+
+ return -1;
+ }
+ }
+ return 0;
+}
+
+int main() {
+ int error = 0;
+
+ init_data();
+
+ do_mm_mmask_i32gather_epi32();
+ error |= check(1, dst128_i, dst_i, mask128, src_i, 4);
+
+ do_mm_mmask_i32gather_ps();
+ error |= check(2, (int *)dst128_f, (int *)dst_f, mask128, (int *)src_f, 4);
+
+ do_mm256_mmask_i32gather_epi32();
+ error |= check(3, dst256_i, dst_i, mask256, src_i, 8);
+
+ do_mm256_mmask_i32gather_ps();
+ error |= check(4, (int *)dst256_f, (int *)dst_f, mask256, (int *)src_f, 8);
+
+ if (error != 0) {
+ printf("FAILED\n");
+ return 1;
+ }
+
+ printf("PASSED\n");
+ return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512VL/i32gather_32.reference_output b/SingleSource/UnitTests/Vector/AVX512VL/i32gather_32.reference_output
new file mode 100644
index 00000000..bfae62d0
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512VL/i32gather_32.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512VL/i32gather_64.c b/SingleSource/UnitTests/Vector/AVX512VL/i32gather_64.c
new file mode 100644
index 00000000..f94de6ae
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512VL/i32gather_64.c
@@ -0,0 +1,143 @@
+/*
+ * This test was created to check the correctness
+ * of support for the following intrinsics:
+ *      _mm_mmask_i32gather_epi64()
+ *      _mm_mmask_i32gather_pd()
+ *      _mm256_mmask_i32gather_epi64()
+ *      _mm256_mmask_i32gather_pd()
+ */
+
+#include <x86intrin.h>
+#include <stdio.h>
+
+#define NUM (256 * 256)
+#define SCALE 8
+
+double dst128_f[NUM];
+double dst256_f[NUM];
+double dst_f[NUM];
+__int64 dst128_i[NUM];
+__int64 dst256_i[NUM];
+__int64 dst_i[NUM];
+double src_f[NUM];
+__int64 src_i[NUM];
+__int64 mask128[NUM / 2];
+__int64 mask256[NUM / 4];
+int g_index[NUM];
+
+#define MIN(x, y) ((x) <= (y) ? (x) : (y))
+
+void init_data() {
+ int i;
+ for (i = 0; i < NUM; i++) {
+ g_index[i] = MIN(i * 17 & 0xce, NUM);
+ src_f[g_index[i]] = src_i[g_index[i]] = i;
+
+ dst_i[i] = dst_f[i] = -i;
+ dst128_i[i] = dst256_i[i] = -i;
+ dst128_f[i] = dst256_f[i] = -i;
+
+ if (i % 2 == 0) {
+ mask128[i / 2] = (i * 77) & 0xf;
+ if (i % 4 == 0) {
+ mask256[i / 4] = (i * 31) & 0xff;
+ }
+ }
+ }
+}
+
+void do_mm_mmask_i32gather_epi64() {
+ int i;
+ for (i = 0; i < NUM; i += 2) {
+
+    // Only the low 2 int32 indices are used; load exactly 64 bits so the
+    // last iteration cannot read past the end of g_index.
+    __m128i ind = _mm_loadl_epi64((const __m128i *)(g_index + i));
+
+ __m128i old_dst = _mm_loadu_si128((__m128i const *)(dst_i + i));
+ __m128i gtr =
+ _mm_mmask_i32gather_epi64(old_dst, mask128[i / 2], ind, src_i, SCALE);
+ _mm_storeu_si128((__m128i *)(dst128_i + i), gtr);
+ }
+}
+
+void do_mm_mmask_i32gather_pd() {
+ int i;
+ for (i = 0; i < NUM; i += 2) {
+
+    // Only the low 2 int32 indices are used; load exactly 64 bits so the
+    // last iteration cannot read past the end of g_index.
+    __m128i ind = _mm_loadl_epi64((const __m128i *)(g_index + i));
+
+ __m128d old_dst = _mm_loadu_pd(dst_f + i);
+ __m128d gtr =
+ _mm_mmask_i32gather_pd(old_dst, mask128[i / 2], ind, src_f, SCALE);
+ _mm_storeu_pd(dst128_f + i, gtr);
+ }
+}
+
+void do_mm256_mmask_i32gather_epi64() {
+ int i;
+ for (i = 0; i < NUM; i += 4) {
+ __m128i ind = _mm_loadu_si128((const __m128i *)(g_index + i));
+ __m256i old_dst = _mm256_loadu_si256((const __m256i *)(dst_i + i));
+ __m256i gtr = _mm256_mmask_i32gather_epi64(old_dst, mask256[i / 4], ind,
+ src_i, SCALE);
+ _mm256_storeu_si256((__m256i *)(dst256_i + i), gtr);
+ }
+}
+
+void do_mm256_mmask_i32gather_pd() {
+ int i;
+ for (i = 0; i < NUM; i += 4) {
+ __m128i ind = _mm_loadu_si128((const __m128i *)(g_index + i));
+ __m256d old_dst = _mm256_loadu_pd(dst_f + i);
+ __m256d gtr =
+ _mm256_mmask_i32gather_pd(old_dst, mask256[i / 4], ind, src_f, SCALE);
+ _mm256_storeu_pd(dst256_f + i, gtr);
+ }
+}
+
+int check(int id, __int64 *res_dst, __int64 *pass_thru_vals, __int64 *mask,
+ __int64 *src, int elems_in_vector) {
+ int i;
+ for (i = 0; i < NUM; i++) {
+ __int64 kmask = mask[i / elems_in_vector];
+ __int64 kmask_bit = kmask & (1 << (i % elems_in_vector));
+
+ __int64 v = kmask_bit ? src[g_index[i]] : pass_thru_vals[i];
+
+ if (v != res_dst[i]) {
+ printf("The testcase #%d FAILed at %d iteration\n", id, i);
+
+ printf("Expected value %lld, actual %lld\n", v, res_dst[i]);
+
+ return -1;
+ }
+ }
+ return 0;
+}
+
+int main() {
+ int error = 0;
+
+ init_data();
+
+ do_mm_mmask_i32gather_epi64();
+ error |= check(1, dst128_i, dst_i, mask128, src_i, 2);
+
+ do_mm_mmask_i32gather_pd();
+ error |= check(2, (__int64 *)dst128_f, (__int64 *)dst_f, mask128,
+ (__int64 *)src_f, 2);
+
+ do_mm256_mmask_i32gather_epi64();
+ error |= check(3, dst256_i, dst_i, mask256, src_i, 4);
+
+ do_mm256_mmask_i32gather_pd();
+ error |= check(4, (__int64 *)dst256_f, (__int64 *)dst_f, mask256,
+ (__int64 *)src_f, 4);
+
+ if (error != 0) {
+ printf("FAILED\n");
+ return 1;
+ }
+
+ printf("PASSED\n");
+ return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512VL/i32gather_64.reference_output b/SingleSource/UnitTests/Vector/AVX512VL/i32gather_64.reference_output
new file mode 100644
index 00000000..bfae62d0
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512VL/i32gather_64.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512VL/i32scatter_32.c b/SingleSource/UnitTests/Vector/AVX512VL/i32scatter_32.c
new file mode 100644
index 00000000..b0d74b61
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512VL/i32scatter_32.c
@@ -0,0 +1,136 @@
+/*
+ * This test was created to check the correctness
+ * of support for the following intrinsics:
+ *      _mm_mask_i32scatter_epi32()
+ *      _mm_mask_i32scatter_ps()
+ *      _mm256_mask_i32scatter_epi32()
+ *      _mm256_mask_i32scatter_ps()
+ */
+
+#include <x86intrin.h>
+#include <stdio.h>
+
+#define NUM (256 * 256)
+#define SCALE 4
+
+float dst128_f[NUM], gold_dst128_f[NUM];
+float dst256_f[NUM], gold_dst256_f[NUM];
+int dst128_i[NUM], gold_dst128_i[NUM];
+int dst256_i[NUM], gold_dst256_i[NUM];
+int mask128[NUM / 4];
+int mask256[NUM / 8];
+int g_index[NUM];
+
+#define MIN(x, y) ((x) <= (y) ? (x) : (y))
+
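+/* g_index visits every destination slot exactly once (even slots
+   ascending, then odd slots descending), so the gold_dst* arrays can be
+   filled directly from the mask bits. */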
+void init_data() {
+ int i;
+ for (i = 0; i < NUM; i++) {
+ dst128_i[i] = dst256_i[i] = -1;
+ dst128_f[i] = dst256_f[i] = -1;
+
+ g_index[i] = i * 2;
+ if (g_index[i] >= NUM) {
+ g_index[i] = NUM - 1 - (i - NUM / 2) * 2;
+ }
+
+ if (i % 4 == 0) {
+ mask128[i / 4] = (i * 77) & 0xf;
+ if (i % 8 == 0) {
+ mask256[i / 8] = (i * 31) & 0xff;
+ }
+ }
+
+ if ((mask128[i / 4] >> (i % 4)) & 0x1) {
+ gold_dst128_i[g_index[i]] = gold_dst128_f[g_index[i]] = i;
+ } else {
+ gold_dst128_i[g_index[i]] = gold_dst128_f[g_index[i]] = -1;
+ }
+
+ if ((mask256[i / 8] >> (i % 8)) & 0x1) {
+ gold_dst256_i[g_index[i]] = gold_dst256_f[g_index[i]] = i;
+ } else {
+ gold_dst256_i[g_index[i]] = gold_dst256_f[g_index[i]] = -1;
+ }
+ }
+}
+
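+/* Each scatter routine writes a vector of consecutive values i, i+1, ...
+   to dst at g_index; only elements with a set mask bit reach memory. */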
+void do_mm_mask_i32scatter_epi32() {
+ int i;
+ for (i = 0; i < NUM; i += 4) {
+ __m128i ind = _mm_loadu_si128((const __m128i *)(g_index + i));
+ __m128i val = _mm_set_epi32(i + 3, i + 2, i + 1, i);
+ _mm_mask_i32scatter_epi32(dst128_i, mask128[i / 4], ind, val, SCALE);
+ }
+}
+
+void do_mm_mask_i32scatter_ps() {
+ int i;
+ for (i = 0; i < NUM; i += 4) {
+ __m128i ind = _mm_loadu_si128((const __m128i *)(g_index + i));
+ __m128 val = _mm_set_ps(i + 3, i + 2, i + 1, i);
+ _mm_mask_i32scatter_ps(dst128_f, mask128[i / 4], ind, val, SCALE);
+ }
+}
+
+void do_mm256_mask_i32scatter_epi32() {
+ int i;
+ for (i = 0; i < NUM; i += 8) {
+ __m256i ind = _mm256_loadu_si256((const __m256i *)(g_index + i));
+ __m256i val =
+ _mm256_set_epi32(i + 7, i + 6, i + 5, i + 4, i + 3, i + 2, i + 1, i);
+ _mm256_mask_i32scatter_epi32(dst256_i, mask256[i / 8], ind, val, SCALE);
+ }
+}
+
+void do_mm256_mask_i32scatter_ps() {
+ int i;
+ for (i = 0; i < NUM; i += 8) {
+ __m256i ind = _mm256_loadu_si256((const __m256i *)(g_index + i));
+ __m256 val =
+ _mm256_set_ps(i + 7, i + 6, i + 5, i + 4, i + 3, i + 2, i + 1, i);
+ _mm256_mask_i32scatter_ps(dst256_f, mask256[i / 8], ind, val, SCALE);
+ }
+}
+
+int check(int id, int *res_dst, int *gold_dst, int *mask, int elems_in_vector) {
+ int i;
+ for (i = 0; i < NUM; i++) {
+ int kmask = mask[i / elems_in_vector];
+ int kmask_bit = kmask & (1 << (i % elems_in_vector));
+
+ if (gold_dst[i] != res_dst[i]) {
+ printf("The testcase #%d FAILed at %d iteration\n", id, i);
+
+ printf("Expected value %d, actual %d, kmask=%d\n", gold_dst[i],
+ res_dst[i], kmask_bit);
+
+ return -1;
+ }
+ }
+ return 0;
+}
+
+int main() {
+ int error = 0;
+
+ init_data();
+
+ do_mm_mask_i32scatter_epi32();
+ error |= check(1, dst128_i, gold_dst128_i, mask128, 4);
+
+ do_mm_mask_i32scatter_ps();
+ error |= check(2, (int *)dst128_f, (int *)gold_dst128_f, mask128, 4);
+
+ do_mm256_mask_i32scatter_epi32();
+ error |= check(3, dst256_i, gold_dst256_i, mask256, 8);
+
+ do_mm256_mask_i32scatter_ps();
+ error |= check(4, (int *)dst256_f, (int *)gold_dst256_f, mask256, 8);
+
+ if (error != 0) {
+ printf("FAILED\n");
+ return 1;
+ }
+
+ printf("PASSED\n");
+ return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512VL/i32scatter_32.reference_output b/SingleSource/UnitTests/Vector/AVX512VL/i32scatter_32.reference_output
new file mode 100644
index 00000000..bfae62d0
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512VL/i32scatter_32.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512VL/i32scatter_64.c b/SingleSource/UnitTests/Vector/AVX512VL/i32scatter_64.c
new file mode 100644
index 00000000..0e8d61e8
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512VL/i32scatter_64.c
@@ -0,0 +1,141 @@
+/*
+ * This test was created to check the correctness
+ * of support for the following intrinsics:
+ *      _mm_mask_i32scatter_epi64()
+ *      _mm_mask_i32scatter_pd()
+ *      _mm256_mask_i32scatter_epi64()
+ *      _mm256_mask_i32scatter_pd()
+ */
+
+#include <x86intrin.h>
+#include <stdio.h>
+
+#define NUM (256 * 256)
+#define SCALE 8
+
+double dst128_f[NUM], gold_dst128_f[NUM];
+double dst256_f[NUM], gold_dst256_f[NUM];
+__int64 dst128_i[NUM], gold_dst128_i[NUM];
+__int64 dst256_i[NUM], gold_dst256_i[NUM];
+int mask128[NUM / 2];
+int mask256[NUM / 4];
+int g_index[NUM];
+
+#define MIN(x, y) ((x) <= (y) ? (x) : (y))
+
+void init_data() {
+ int i;
+ for (i = 0; i < NUM; i++) {
+ dst128_i[i] = dst256_i[i] = -1;
+ dst128_f[i] = dst256_f[i] = -1;
+
+ g_index[i] = i * 2;
+ if (g_index[i] >= NUM) {
+ g_index[i] = NUM - 1 - (i - NUM / 2) * 2;
+ }
+
+ if (i % 2 == 0) {
+ mask128[i / 2] = (i * 77) & 0xf;
+ if (i % 4 == 0) {
+ mask256[i / 4] = (i * 31) & 0xff;
+ }
+ }
+
+ if ((mask128[i / 2] >> (i % 2)) & 0x1) {
+ gold_dst128_i[g_index[i]] = gold_dst128_f[g_index[i]] = i;
+ } else {
+ gold_dst128_i[g_index[i]] = gold_dst128_f[g_index[i]] = -1;
+ }
+
+ if ((mask256[i / 4] >> (i % 4)) & 0x1) {
+ gold_dst256_i[g_index[i]] = gold_dst256_f[g_index[i]] = i;
+ } else {
+ gold_dst256_i[g_index[i]] = gold_dst256_f[g_index[i]] = -1;
+ }
+ }
+}
+
+void do_mm_mask_i32scatter_epi64() {
+ int i;
+ for (i = 0; i < NUM; i += 2) {
+
+    // Only the low 2 int32 indices are used; load exactly 64 bits so the
+    // last iteration cannot read past the end of g_index.
+    __m128i ind = _mm_loadl_epi64((const __m128i *)(g_index + i));
+
+ __m128i val = _mm_set_epi64x(i + 1, i);
+ _mm_mask_i32scatter_epi64(dst128_i, mask128[i / 2], ind, val, SCALE);
+ }
+}
+
+void do_mm_mask_i32scatter_pd() {
+ int i;
+ for (i = 0; i < NUM; i += 2) {
+
+    // Only the low 2 int32 indices are used; load exactly 64 bits so the
+    // last iteration cannot read past the end of g_index.
+    __m128i ind = _mm_loadl_epi64((const __m128i *)(g_index + i));
+
+ __m128d val = _mm_set_pd(i + 1, i);
+ _mm_mask_i32scatter_pd(dst128_f, mask128[i / 2], ind, val, SCALE);
+ }
+}
+
+void do_mm256_mask_i32scatter_epi64() {
+ int i;
+ for (i = 0; i < NUM; i += 4) {
+ __m128i ind = _mm_loadu_si128((const __m128i *)(g_index + i));
+ __m256i val = _mm256_set_epi64x(i + 3, i + 2, i + 1, i);
+ _mm256_mask_i32scatter_epi64(dst256_i, mask256[i / 4], ind, val, SCALE);
+ }
+}
+
+void do_mm256_mask_i32scatter_pd() {
+ int i;
+ for (i = 0; i < NUM; i += 4) {
+ __m128i ind = _mm_loadu_si128((const __m128i *)(g_index + i));
+ __m256d val = _mm256_set_pd(i + 3, i + 2, i + 1, i);
+ _mm256_mask_i32scatter_pd(dst256_f, mask256[i / 4], ind, val, SCALE);
+ }
+}
+
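+/* kmask_bit is reported only for diagnostics; correctness is judged
+   against the gold image precomputed in init_data. */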
+int check(int id, __int64 *res_dst, __int64 *gold_dst, int *mask,
+ int elems_in_vector) {
+ int i;
+ for (i = 0; i < NUM; i++) {
+ int kmask = mask[i / elems_in_vector];
+ int kmask_bit = kmask & (1 << (i % elems_in_vector));
+
+ if (gold_dst[i] != res_dst[i]) {
+ printf("The testcase #%d FAILed at %d iteration\n", id, i);
+
+ printf("Expected value %lld, actual %lld, kmask=%d\n", gold_dst[i],
+ res_dst[i], kmask_bit);
+
+ return -1;
+ }
+ }
+ return 0;
+}
+
+int main() {
+ int error = 0;
+
+ init_data();
+
+ do_mm_mask_i32scatter_epi64();
+ error |= check(1, dst128_i, gold_dst128_i, mask128, 2);
+
+ do_mm_mask_i32scatter_pd();
+ error |= check(2, (__int64 *)dst128_f, (__int64 *)gold_dst128_f, mask128, 2);
+
+ do_mm256_mask_i32scatter_epi64();
+ error |= check(3, dst256_i, gold_dst256_i, mask256, 4);
+
+ do_mm256_mask_i32scatter_pd();
+ error |= check(4, (__int64 *)dst256_f, (__int64 *)gold_dst256_f, mask256, 4);
+
+ if (error != 0) {
+ printf("FAILED\n");
+ return 1;
+ }
+
+ printf("PASSED\n");
+ return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512VL/i32scatter_64.reference_output b/SingleSource/UnitTests/Vector/AVX512VL/i32scatter_64.reference_output
new file mode 100644
index 00000000..bfae62d0
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512VL/i32scatter_64.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512VL/i64gather_32.c b/SingleSource/UnitTests/Vector/AVX512VL/i64gather_32.c
new file mode 100644
index 00000000..b510ab68
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512VL/i64gather_32.c
@@ -0,0 +1,137 @@
+/*
+ * This test was created to check the correctness
+ * of support for the following intrinsics:
+ *      _mm_mmask_i64gather_epi32()
+ *      _mm_mmask_i64gather_ps()
+ *      _mm256_mmask_i64gather_epi32()
+ *      _mm256_mmask_i64gather_ps()
+ */
+
+#include <x86intrin.h>
+#include <stdio.h>
+
+#define NUM (256 * 256)
+#define SCALE 4
+
+float dst128_f[NUM];
+float dst256_f[NUM];
+float dst_f[NUM];
+int dst128_i[NUM];
+int dst256_i[NUM];
+int dst_i[NUM];
+float src_f[NUM];
+int src_i[NUM];
+int mask128[NUM / 2];
+int mask256[NUM / 4];
+__int64 g_index[NUM];
+
+#define MIN(x, y) ((x) <= (y) ? (x) : (y))
+
+void init_data() {
+ int i;
+ for (i = 0; i < NUM; i++) {
+ g_index[i] = MIN(i * 17 & 0xce, NUM);
+ src_f[g_index[i]] = src_i[g_index[i]] = i;
+
+ dst_i[i] = dst_f[i] = -i;
+ dst128_i[i] = dst256_i[i] = -i;
+ dst128_f[i] = dst256_f[i] = -i;
+
+ if (i % 2 == 0) {
+ mask128[i / 2] = (i * 77) & 0xf;
+ if (i % 4 == 0) {
+ mask256[i / 4] = (i * 31) & 0xff;
+ }
+ }
+ }
+}
+
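+/* 64-bit indices gather 32-bit elements, so each 128-bit index vector
+   yields only two dwords; keep the loads and stores at 64 bits to avoid
+   running past the ends of the int arrays on the last iteration. */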
+void do_mm_mmask_i64gather_epi32() {
+ int i;
+ for (i = 0; i < NUM; i += 2) {
+ __m128i ind = _mm_loadu_si128((const __m128i *)(g_index + i));
+    // Load just the 64 bits holding the two pass-through elements.
+    __m128i old_dst = _mm_loadl_epi64((__m128i const *)(dst_i + i));
+ __m128i gtr =
+ _mm_mmask_i64gather_epi32(old_dst, mask128[i / 2], ind, src_i, SCALE);
+    // Store just the two gathered dwords (the upper half of gtr is zero).
+    _mm_storel_epi64((__m128i *)(dst128_i + i), gtr);
+ }
+}
+
+void do_mm_mmask_i64gather_ps() {
+ int i;
+ for (i = 0; i < NUM; i += 2) {
+ __m128i ind = _mm_loadu_si128((const __m128i *)(g_index + i));
+    // Load just the 64 bits holding the two pass-through elements.
+    __m128 old_dst = _mm_loadl_pi(_mm_setzero_ps(), (__m64 const *)(dst_f + i));
+ __m128 gtr =
+ _mm_mmask_i64gather_ps(old_dst, mask128[i / 2], ind, src_f, SCALE);
+    // Store just the two gathered floats (the upper half of gtr is zero).
+    _mm_storel_pi((__m64 *)(dst128_f + i), gtr);
+ }
+}
+
+void do_mm256_mmask_i64gather_epi32() {
+ int i;
+ for (i = 0; i < NUM; i += 4) {
+ __m256i ind = _mm256_loadu_si256((const __m256i *)(g_index + i));
+ __m128i old_dst = _mm_loadu_si128((const __m128i *)(dst_i + i));
+ __m128i gtr = _mm256_mmask_i64gather_epi32(old_dst, mask256[i / 4], ind,
+ src_i, SCALE);
+ _mm_storeu_si128((__m128i *)(dst256_i + i), gtr);
+ }
+}
+
+void do_mm256_mmask_i64gather_ps() {
+ int i;
+ for (i = 0; i < NUM; i += 4) {
+ __m256i ind = _mm256_loadu_si256((const __m256i *)(g_index + i));
+ __m128 old_dst = _mm_loadu_ps(dst_f + i);
+ __m128 gtr =
+ _mm256_mmask_i64gather_ps(old_dst, mask256[i / 4], ind, src_f, SCALE);
+ _mm_storeu_ps(dst256_f + i, gtr);
+ }
+}
+
+int check(int id, int *res_dst, int *pass_thru_vals, int *mask, int *src,
+ int elems_in_vector) {
+ int i;
+ for (i = 0; i < NUM; i++) {
+ int kmask = mask[i / elems_in_vector];
+ int kmask_bit = kmask & (1 << (i % elems_in_vector));
+
+ int v = kmask_bit ? src[g_index[i]] : pass_thru_vals[i];
+ // printf("v= %d, g_index[i] = %d, src[g_index[i]]= %d\n ", v, g_index[i],
+ // src[g_index[i]]);
+
+ if (v != res_dst[i]) {
+ printf("The testcase #%d FAILed at %d iteration\n", id, i);
+
+ printf("Expected value %d, actual %d\n", v, res_dst[i]);
+
+ return -1;
+ }
+ }
+ return 0;
+}
+
+int main() {
+ int error = 0;
+
+ init_data();
+
+ do_mm_mmask_i64gather_epi32();
+ error |= check(1, dst128_i, dst_i, mask128, src_i, 2);
+
+ do_mm_mmask_i64gather_ps();
+ error |= check(2, (int *)dst128_f, (int *)dst_f, mask128, (int *)src_f, 2);
+
+ do_mm256_mmask_i64gather_epi32();
+ error |= check(3, dst256_i, dst_i, mask256, src_i, 4);
+
+ do_mm256_mmask_i64gather_ps();
+ error |= check(4, (int *)dst256_f, (int *)dst_f, mask256, (int *)src_f, 4);
+
+ if (error != 0) {
+ printf("FAILED\n");
+ return 1;
+ }
+
+ printf("PASSED\n");
+ return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512VL/i64gather_32.reference_output b/SingleSource/UnitTests/Vector/AVX512VL/i64gather_32.reference_output
new file mode 100644
index 00000000..bfae62d0
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512VL/i64gather_32.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512VL/i64gather_64.c b/SingleSource/UnitTests/Vector/AVX512VL/i64gather_64.c
new file mode 100644
index 00000000..adbc5b31
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512VL/i64gather_64.c
@@ -0,0 +1,139 @@
+/*
+ * This test was created to check the correctness
+ * of support for the following intrinsics:
+ *      _mm_mmask_i64gather_epi64()
+ *      _mm_mmask_i64gather_pd()
+ *      _mm256_mmask_i64gather_epi64()
+ *      _mm256_mmask_i64gather_pd()
+ */
+
+#include <x86intrin.h>
+#include <stdio.h>
+
+#define NUM (256 * 256)
+#define SCALE 8
+
+double dst128_f[NUM];
+double dst256_f[NUM];
+double dst_f[NUM];
+__int64 dst128_i[NUM];
+__int64 dst256_i[NUM];
+__int64 dst_i[NUM];
+double src_f[NUM];
+__int64 src_i[NUM];
+__int64 mask128[NUM / 2];
+__int64 mask256[NUM / 4];
+__int64 g_index[NUM];
+
+#define MIN(x, y) ((x) <= (y) ? (x) : (y))
+
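+/* Mask entries carry more bits than needed; only the low 2 (xmm) or
+   4 (ymm) bits of each entry take part in the gather and the check. */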
+void init_data() {
+ int i;
+ for (i = 0; i < NUM; i++) {
+ g_index[i] = MIN(i * 17 & 0xce, NUM);
+ src_f[g_index[i]] = src_i[g_index[i]] = i;
+
+ dst_i[i] = dst_f[i] = -i;
+ dst128_i[i] = dst256_i[i] = -i;
+ dst128_f[i] = dst256_f[i] = -i;
+
+ if (i % 2 == 0) {
+ mask128[i / 2] = (i * 77) & 0xf;
+ if (i % 4 == 0) {
+ mask256[i / 4] = (i * 31) & 0xff;
+ }
+ }
+ }
+}
+
+void do_mm_mmask_i64gather_epi64() {
+ int i;
+ for (i = 0; i < NUM; i += 2) {
+ __m128i ind = _mm_loadu_si128((const __m128i *)(g_index + i));
+ __m128i old_dst = _mm_loadu_si128((__m128i const *)(dst_i + i));
+ __m128i gtr =
+ _mm_mmask_i64gather_epi64(old_dst, mask128[i / 2], ind, src_i, SCALE);
+ _mm_storeu_si128((__m128i *)(dst128_i + i), gtr);
+ }
+}
+
+void do_mm_mmask_i64gather_pd() {
+ int i;
+ for (i = 0; i < NUM; i += 2) {
+ __m128i ind = _mm_loadu_si128((const __m128i *)(g_index + i));
+ __m128d old_dst = _mm_loadu_pd(dst_f + i);
+ __m128d gtr =
+ _mm_mmask_i64gather_pd(old_dst, mask128[i / 2], ind, src_f, SCALE);
+ _mm_storeu_pd(dst128_f + i, gtr);
+ }
+}
+
+void do_mm256_mmask_i64gather_epi64() {
+ int i;
+ for (i = 0; i < NUM; i += 4) {
+ __m256i ind = _mm256_loadu_si256((const __m256i *)(g_index + i));
+ __m256i old_dst = _mm256_loadu_si256((const __m256i *)(dst_i + i));
+ __m256i gtr = _mm256_mmask_i64gather_epi64(old_dst, mask256[i / 4], ind,
+ src_i, SCALE);
+ _mm256_storeu_si256((__m256i *)(dst256_i + i), gtr);
+ }
+}
+
+void do_mm256_mmask_i64gather_pd() {
+ int i;
+ for (i = 0; i < NUM; i += 4) {
+ __m256i ind = _mm256_loadu_si256((const __m256i *)(g_index + i));
+ __m256d old_dst = _mm256_loadu_pd(dst_f + i);
+ __m256d gtr =
+ _mm256_mmask_i64gather_pd(old_dst, mask256[i / 4], ind, src_f, SCALE);
+ _mm256_storeu_pd(dst256_f + i, gtr);
+ }
+}
+
+int check(int id, __int64 *res_dst, __int64 *pass_thru_vals, __int64 *mask,
+ __int64 *src, int elems_in_vector) {
+ int i;
+ for (i = 0; i < NUM; i++) {
+ __int64 kmask = mask[i / elems_in_vector];
+ __int64 kmask_bit = kmask & (1 << (i % elems_in_vector));
+
+ __int64 v = kmask_bit ? src[g_index[i]] : pass_thru_vals[i];
+ // printf("v= %d, g_index[i] = %d, src[g_index[i]]= %d\n ", v, g_index[i],
+ // src[g_index[i]]);
+
+ if (v != res_dst[i]) {
+ printf("The testcase #%d FAILed at %d iteration\n", id, i);
+
+ printf("Expected value %lld, actual %lld\n", v, res_dst[i]);
+
+ return -1;
+ }
+ }
+ return 0;
+}
+
+int main() {
+ int error = 0;
+
+ init_data();
+
+ do_mm_mmask_i64gather_epi64();
+ error |= check(1, dst128_i, dst_i, mask128, src_i, 2);
+
+ do_mm_mmask_i64gather_pd();
+ error |= check(2, (__int64 *)dst128_f, (__int64 *)dst_f, mask128,
+ (__int64 *)src_f, 2);
+
+ do_mm256_mmask_i64gather_epi64();
+ error |= check(3, dst256_i, dst_i, mask256, src_i, 4);
+
+ do_mm256_mmask_i64gather_pd();
+ error |= check(4, (__int64 *)dst256_f, (__int64 *)dst_f, mask256,
+ (__int64 *)src_f, 4);
+
+ if (error != 0) {
+ printf("FAILED\n");
+ return 1;
+ }
+
+ printf("PASSED\n");
+ return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512VL/i64gather_64.reference_output b/SingleSource/UnitTests/Vector/AVX512VL/i64gather_64.reference_output
new file mode 100644
index 00000000..bfae62d0
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512VL/i64gather_64.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512VL/i64scatter_32.c b/SingleSource/UnitTests/Vector/AVX512VL/i64scatter_32.c
new file mode 100644
index 00000000..32ecc9ac
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512VL/i64scatter_32.c
@@ -0,0 +1,139 @@
+/*
+ * This test was created to check the correctness
+ * of support for the following intrinsics:
+ *      _mm_mask_i64scatter_epi32()
+ *      _mm_mask_i64scatter_ps()
+ *      _mm256_mask_i64scatter_epi32()
+ *      _mm256_mask_i64scatter_ps()
+ */
+
+#include <x86intrin.h>
+#include <stdio.h>
+
+#define NUM (256 * 256)
+#define SCALE 4
+
+float dst128_f[NUM], gold_dst128_f[NUM];
+float dst256_f[NUM], gold_dst256_f[NUM];
+int dst128_i[NUM], gold_dst128_i[NUM];
+int dst256_i[NUM], gold_dst256_i[NUM];
+int mask128[NUM / 2];
+int mask256[NUM / 4];
+__int64 g_index[NUM];
+
+#define MIN(x, y) ((x) <= (y) ? (x) : (y))
+
+void init_data() {
+ int i;
+ for (i = 0; i < NUM; i++) {
+ dst128_i[i] = dst256_i[i] = -1;
+ dst128_f[i] = dst256_f[i] = -1;
+
+ g_index[i] = i * 2;
+ if (g_index[i] >= NUM) {
+ g_index[i] = NUM - 1 - (i - NUM / 2) * 2;
+ }
+
+ if (i % 2 == 0) {
+ mask128[i / 2] = (i * 77) & 0xf;
+ if (i % 4 == 0) {
+ mask256[i / 4] = (i * 31) & 0xff;
+ }
+ }
+
+ if ((mask128[i / 2] >> (i % 2)) & 0x1) {
+ gold_dst128_i[g_index[i]] = gold_dst128_f[g_index[i]] = i;
+ } else {
+ gold_dst128_i[g_index[i]] = gold_dst128_f[g_index[i]] = -1;
+ }
+
+ if ((mask256[i / 4] >> (i % 4)) & 0x1) {
+ gold_dst256_i[g_index[i]] = gold_dst256_f[g_index[i]] = i;
+ } else {
+ gold_dst256_i[g_index[i]] = gold_dst256_f[g_index[i]] = -1;
+ }
+ }
+}
+
+void do_mm_mask_i64scatter_epi32() {
+ int i;
+ for (i = 0; i < NUM; i += 2) {
+ __m128i ind = _mm_loadu_si128((const __m128i *)(g_index + i));
+
+ // Only the low 2 int32 values are going to be used.
+ __m128i val = _mm_set_epi32(0, 0, i + 1, i);
+
+ _mm_mask_i64scatter_epi32(dst128_i, mask128[i / 2], ind, val, SCALE);
+ }
+}
+
+void do_mm_mask_i64scatter_ps() {
+ int i;
+ for (i = 0; i < NUM; i += 2) {
+ __m128i ind = _mm_loadu_si128((const __m128i *)(g_index + i));
+
+ // Only the low 2 int32 values are going to be used.
+ __m128 val = _mm_set_ps(0, 0, i + 1, i);
+
+ _mm_mask_i64scatter_ps(dst128_f, mask128[i / 2], ind, val, SCALE);
+ }
+}
+
+void do_mm256_mask_i64scatter_epi32() {
+ int i;
+ for (i = 0; i < NUM; i += 4) {
+ __m256i ind = _mm256_loadu_si256((const __m256i *)(g_index + i));
+ __m128i val = _mm_set_epi32(i + 3, i + 2, i + 1, i);
+ _mm256_mask_i64scatter_epi32(dst256_i, mask256[i / 4], ind, val, SCALE);
+ }
+}
+
+void do_mm256_mask_i64scatter_ps() {
+ int i;
+ for (i = 0; i < NUM; i += 4) {
+ __m256i ind = _mm256_loadu_si256((const __m256i *)(g_index + i));
+ __m128 val = _mm_set_ps(i + 3, i + 2, i + 1, i);
+ _mm256_mask_i64scatter_ps(dst256_f, mask256[i / 4], ind, val, SCALE);
+ }
+}
+
+int check(int id, int *res_dst, int *gold_dst, int *mask, int elems_in_vector) {
+ int i;
+ for (i = 0; i < NUM; i++) {
+ int kmask = mask[i / elems_in_vector];
+ int kmask_bit = kmask & (1 << (i % elems_in_vector));
+
+ if (gold_dst[i] != res_dst[i]) {
+ printf("The testcase #%d FAILed at %d iteration\n", id, i);
+
+ printf("Expected value %d, actual %d, kmask=%d\n", gold_dst[i],
+ res_dst[i], kmask_bit);
+ return -1;
+ }
+ }
+ return 0;
+}
+
+int main() {
+ int error = 0;
+
+ init_data();
+
+ do_mm_mask_i64scatter_epi32();
+ error |= check(1, dst128_i, gold_dst128_i, mask128, 2);
+
+ do_mm_mask_i64scatter_ps();
+ error |= check(2, (int *)dst128_f, (int *)gold_dst128_f, mask128, 2);
+
+ do_mm256_mask_i64scatter_epi32();
+ error |= check(3, dst256_i, gold_dst256_i, mask256, 4);
+
+ do_mm256_mask_i64scatter_ps();
+ error |= check(4, (int *)dst256_f, (int *)gold_dst256_f, mask256, 4);
+
+ if (error != 0) {
+ printf("FAILED\n");
+ return 1;
+ }
+
+ printf("PASSED\n");
+ return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512VL/i64scatter_32.reference_output b/SingleSource/UnitTests/Vector/AVX512VL/i64scatter_32.reference_output
new file mode 100644
index 00000000..bfae62d0
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512VL/i64scatter_32.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/AVX512VL/i64scatter_64.c b/SingleSource/UnitTests/Vector/AVX512VL/i64scatter_64.c
new file mode 100644
index 00000000..069248e4
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512VL/i64scatter_64.c
@@ -0,0 +1,135 @@
+/*
+ * This test was created to check the correctness
+ * of support for the following intrinsics:
+ *      _mm_mask_i64scatter_epi64()
+ *      _mm_mask_i64scatter_pd()
+ *      _mm256_mask_i64scatter_epi64()
+ *      _mm256_mask_i64scatter_pd()
+ */
+
+#include <x86intrin.h>
+#include <stdio.h>
+
+#define NUM (256 * 256)
+#define SCALE 8
+
+double dst128_f[NUM], gold_dst128_f[NUM];
+double dst256_f[NUM], gold_dst256_f[NUM];
+__int64 dst128_i[NUM], gold_dst128_i[NUM];
+__int64 dst256_i[NUM], gold_dst256_i[NUM];
+int mask128[NUM / 2];
+int mask256[NUM / 4];
+__int64 g_index[NUM];
+
+#define MIN(x, y) ((x) <= (y) ? (x) : (y))
+
+void init_data() {
+ int i;
+ for (i = 0; i < NUM; i++) {
+ dst128_i[i] = dst256_i[i] = -1;
+ dst128_f[i] = dst256_f[i] = -1;
+
+ g_index[i] = i * 2;
+ if (g_index[i] >= NUM) {
+ g_index[i] = NUM - 1 - (i - NUM / 2) * 2;
+ }
+
+ if (i % 2 == 0) {
+ mask128[i / 2] = (i * 77) & 0xf;
+ if (i % 4 == 0) {
+ mask256[i / 4] = (i * 31) & 0xff;
+ }
+ }
+
+ if ((mask128[i / 2] >> (i % 2)) & 0x1) {
+ gold_dst128_i[g_index[i]] = gold_dst128_f[g_index[i]] = i;
+ } else {
+ gold_dst128_i[g_index[i]] = gold_dst128_f[g_index[i]] = -1;
+ }
+
+ if ((mask256[i / 4] >> (i % 4)) & 0x1) {
+ gold_dst256_i[g_index[i]] = gold_dst256_f[g_index[i]] = i;
+ } else {
+ gold_dst256_i[g_index[i]] = gold_dst256_f[g_index[i]] = -1;
+ }
+ }
+}
+
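+/* Indices and data are both 64-bit here, so a full 128-bit (or 256-bit)
+   load supplies exactly the right number of indices per iteration. */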
+void do_mm_mask_i64scatter_epi64() {
+ int i;
+ for (i = 0; i < NUM; i += 2) {
+ __m128i ind = _mm_loadu_si128((const __m128i *)(g_index + i));
+ __m128i val = _mm_set_epi64x(i + 1, i);
+ _mm_mask_i64scatter_epi64(dst128_i, mask128[i / 2], ind, val, SCALE);
+ }
+}
+
+void do_mm_mask_i64scatter_pd() {
+ int i;
+ for (i = 0; i < NUM; i += 2) {
+ __m128i ind = _mm_loadu_si128((const __m128i *)(g_index + i));
+ __m128d val = _mm_set_pd(i + 1, i);
+ _mm_mask_i64scatter_pd(dst128_f, mask128[i / 2], ind, val, SCALE);
+ }
+}
+
+void do_mm256_mask_i64scatter_epi64() {
+ int i;
+ for (i = 0; i < NUM; i += 4) {
+ __m256i ind = _mm256_loadu_si256((const __m256i *)(g_index + i));
+ __m256i val = _mm256_set_epi64x(i + 3, i + 2, i + 1, i);
+ _mm256_mask_i64scatter_epi64(dst256_i, mask256[i / 4], ind, val, SCALE);
+ }
+}
+
+void do_mm256_mask_i64scatter_pd() {
+ int i;
+ for (i = 0; i < NUM; i += 4) {
+ __m256i ind = _mm256_loadu_si256((const __m256i *)(g_index + i));
+ __m256d val = _mm256_set_pd(i + 3, i + 2, i + 1, i);
+ _mm256_mask_i64scatter_pd(dst256_f, mask256[i / 4], ind, val, SCALE);
+ }
+}
+
+int check(int id, __int64 *res_dst, __int64 *gold_dst, int *mask,
+ int elems_in_vector) {
+ int i;
+ for (i = 0; i < NUM; i++) {
+ int kmask = mask[i / elems_in_vector];
+ int kmask_bit = kmask & (1 << (i % elems_in_vector));
+
+ if (gold_dst[i] != res_dst[i]) {
+ printf("The testcase #%d FAILed at %d iteration\n", id, i);
+
+ printf("Expected value %lld, actual %lld, kmask=%d\n", gold_dst[i],
+ res_dst[i], kmask_bit);
+
+ return -1;
+ }
+ }
+ return 0;
+}
+
+int main() {
+ int error = 0;
+
+ init_data();
+
+ do_mm_mask_i64scatter_epi64();
+ error |= check(1, dst128_i, gold_dst128_i, mask128, 2);
+
+ do_mm_mask_i64scatter_pd();
+ error |= check(2, (__int64 *)dst128_f, (__int64 *)gold_dst128_f, mask128, 2);
+
+ do_mm256_mask_i64scatter_epi64();
+ error |= check(3, dst256_i, gold_dst256_i, mask256, 4);
+
+ do_mm256_mask_i64scatter_pd();
+ error |= check(4, (__int64 *)dst256_f, (__int64 *)gold_dst256_f, mask256, 4);
+
+ if (error != 0) {
+ printf("FAILED\n");
+ return 1;
+ }
+
+ printf("PASSED\n");
+ return 0;
+}
diff --git a/SingleSource/UnitTests/Vector/AVX512VL/i64scatter_64.reference_output b/SingleSource/UnitTests/Vector/AVX512VL/i64scatter_64.reference_output
new file mode 100644
index 00000000..bfae62d0
--- /dev/null
+++ b/SingleSource/UnitTests/Vector/AVX512VL/i64scatter_64.reference_output
@@ -0,0 +1,2 @@
+PASSED
+exit 0
diff --git a/SingleSource/UnitTests/Vector/CMakeLists.txt b/SingleSource/UnitTests/Vector/CMakeLists.txt
index 54d7634d..5d51e4c2 100644
--- a/SingleSource/UnitTests/Vector/CMakeLists.txt
+++ b/SingleSource/UnitTests/Vector/CMakeLists.txt
@@ -17,8 +17,11 @@ if(CMAKE_C_COMPILER_ID STREQUAL "Clang")
if(ARCH STREQUAL "x86")
if(X86CPU_ARCH STREQUAL "skylake-avx512")
add_subdirectory(AVX512F)
+ add_subdirectory(AVX512BW)
add_subdirectory(AVX512BWVL)
add_subdirectory(AVX512DQ)
+ add_subdirectory(AVX512DQVL)
+ add_subdirectory(AVX512VL)
endif()
if(X86CPU_ARCH STREQUAL "knl")
add_subdirectory(AVX512F)
diff --git a/SingleSource/UnitTests/Vector/Makefile b/SingleSource/UnitTests/Vector/Makefile
index 671634ab..723dddd6 100644
--- a/SingleSource/UnitTests/Vector/Makefile
+++ b/SingleSource/UnitTests/Vector/Makefile
@@ -17,13 +17,26 @@ DIRS += SSE
endif
ifeq ($(CC_UNDER_TEST_IS_CLANG), 1)
+
ifeq ($(HAVE_X86_AVX512F_INSTRUCTIONS), 1)
DIRS += AVX512F
endif
+
+ifeq ($(HAVE_X86_AVX512VL_INSTRUCTIONS), 1)
+DIRS += AVX512VL
ifeq ($(HAVE_X86_AVX512BW_INSTRUCTIONS), 1)
DIRS += AVX512BWVL
endif
ifeq ($(HAVE_X86_AVX512DQ_INSTRUCTIONS), 1)
+DIRS += AVX512DQVL
+endif
+endif
+
+ifeq ($(HAVE_X86_AVX512BW_INSTRUCTIONS), 1)
+DIRS += AVX512BW
+endif
+
+ifeq ($(HAVE_X86_AVX512DQ_INSTRUCTIONS), 1)
DIRS += AVX512DQ
endif
endif