diff options
Diffstat (limited to 'SingleSource/UnitTests/Vector/AVX512F/math.c')
-rw-r--r-- | SingleSource/UnitTests/Vector/AVX512F/math.c | 696 |
1 file changed, 696 insertions, 0 deletions
diff --git a/SingleSource/UnitTests/Vector/AVX512F/math.c b/SingleSource/UnitTests/Vector/AVX512F/math.c new file mode 100644 index 00000000..2affc18a --- /dev/null +++ b/SingleSource/UnitTests/Vector/AVX512F/math.c @@ -0,0 +1,696 @@ +/* + * Test math instructions: sqrt, reciprocal, floor, ceil, exponent, + * scale, fixup ,roundscale and ternary logic. + * Here we check for _mm512_[mask|maskz]_[ceil|floor|scalef|sqrt|ternarylogic] + * intrinsics. + */ +#include "m512_test_util.h" +#include <math.h> +#include <stdio.h> +#include <string.h> + +volatile int vol0 = 0; + +V512 i32; +V512 i32_squares; +V512 i32_neg; +V512 i64; +V512 i64_squares; +V512 i64_neg; +V512 f32; +V512 f32_squares; +V512 f32_halves; +V512 f64; +V512 f64_squares; +V512 f64_halves; + +void NOINLINE init() { + volatile int i; + + for (i = 0; i < 16; i++) { + i32.s32[i] = i; + i32_squares.s32[i] = i * i; + i32_neg.s32[i] = -i; + f32.f32[i] = i; + f32_squares.f32[i] = i * i; + f32_halves.f32[i] = i + 0.5f; + } + + for (i = 0; i < 8; i++) { + i64.s64[i] = i; + i64_squares.s64[i] = i * i; + i64_neg.s64[i] = -i; + f64.f64[i] = i; + f64_squares.f64[i] = i * i; + f64_halves.f64[i] = i + 0.5; + } +} + +void NOINLINE do_rcp14pd() { + volatile V512 res; + V512 expected; + __mmask8 k = 0xc3; + + res.zmmd = _mm512_rcp14_pd(f64.zmmd); + + res.zmmi = _mm512_setzero_epi32(); + res.zmmd = _mm512_mask_rcp14_pd(res.zmmd, k, f64.zmmd); +} + +void NOINLINE do_rcp14ps() { + volatile V512 res; + V512 expected; + __mmask16 k = 0x7e95; + + res.zmm = _mm512_rcp14_ps(f32.zmm); + + res.zmmi = _mm512_setzero_epi32(); + res.zmm = _mm512_mask_rcp14_ps(res.zmm, k, f32.zmm); +} + +void NOINLINE do_sqrtps() { + V512 res; + V512 expected; + __mmask16 k; + volatile int i; + + res.zmm = _mm512_sqrt_ps(f32_squares.zmm); + for (i = 0; i < 16; i++) { + expected.f32[i] = i; + } + check_equal_nd(&res, &expected, 16, "_mm512_sqrt_ps", __LINE__); + + f32_squares.xmm[vol0] = f32_squares.xmm[vol0]; /* No-op. 
*/ + + k = 0xbcdf; + res.zmm = _mm512_setzero_ps(); + res.zmm = _mm512_mask_sqrt_ps(res.zmm, k, f32_squares.zmm); + expected.zmm = _mm512_setzero_ps(); + for (i = 0; i < 16; i++) { + if (k & (1 << i)) { + expected.f32[i] = i; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_sqrt_ps", __LINE__); +} + +void NOINLINE do_sqrtpd() { + V512 res; + V512 expected; + __mmask8 k; + volatile int i; + + res.zmmd = _mm512_sqrt_pd(f64_squares.zmmd); + for (i = 0; i < 8; i++) { + expected.f64[i] = i; + } + check_equal_nd(&res, &expected, 16, "_mm512_sqrt_pd", __LINE__); + + f64_squares.xmmd[vol0] = f64_squares.xmmd[vol0]; /* No-op. */ + + k = 0xe9; + res.zmmd = _mm512_setzero_pd(); + res.zmmd = _mm512_mask_sqrt_pd(res.zmmd, k, f64_squares.zmmd); + expected.zmmd = _mm512_setzero_pd(); + for (i = 0; i < 8; i++) { + if (k & (1 << i)) { + expected.f64[i] = i; + } + } + + check_equal_nd(&res, &expected, 16, "_mm512_mask_sqrt_pd", __LINE__); +} + +void NOINLINE do_floorps() { + V512 res; + V512 expected; + __mmask16 k; + volatile int i; + + res.zmm = _mm512_floor_ps(f32_halves.zmm); + for (i = 0; i < 16; i++) { + expected.f32[i] = i; + } + check_equal_nd(&res, &expected, 16, "_mm512_floor_ps", __LINE__); + + f32_halves.xmm[vol0] = f32_halves.xmm[vol0]; /* No-op. */ + + k = 0xbcdf; + res.zmm = _mm512_setzero_ps(); + res.zmm = _mm512_mask_floor_ps(res.zmm, k, f32_halves.zmm); + expected.zmm = _mm512_setzero_ps(); + for (i = 0; i < 16; i++) { + if (k & (1 << i)) { + expected.f32[i] = i; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_floor_ps", __LINE__); +} + +void NOINLINE do_floorpd() { + V512 res; + V512 expected; + __mmask8 k; + volatile int i; + + res.zmmd = _mm512_floor_pd(f64_halves.zmmd); + for (i = 0; i < 8; i++) { + expected.f64[i] = i; + } + check_equal_nd(&res, &expected, 16, "_mm512_floor_pd", __LINE__); + + f64_halves.xmmd[vol0] = f64_halves.xmmd[vol0]; /* No-op. 
*/ + + k = 0x7b; + res.zmmd = _mm512_setzero_pd(); + res.zmmd = _mm512_mask_floor_pd(res.zmmd, k, f64_halves.zmmd); + expected.zmmd = _mm512_setzero_pd(); + for (i = 0; i < 8; i++) { + if (k & (1 << i)) { + expected.f64[i] = i; + } + } + + check_equal_nd(&res, &expected, 16, "_mm512_mask_floor_pd", __LINE__); +} + +void NOINLINE do_ceilps() { + V512 res; + V512 expected; + __mmask16 k; + volatile int i; + + res.zmm = _mm512_ceil_ps(f32_halves.zmm); + for (i = 0; i < 16; i++) { + expected.f32[i] = i + 1; + } + check_equal_nd(&res, &expected, 16, "_mm512_ceil_ps", __LINE__); + + f32_halves.xmm[vol0] = f32_halves.xmm[vol0]; /* No-op. */ + + k = 0xbcdf; + res.zmm = _mm512_setzero_ps(); + res.zmm = _mm512_mask_ceil_ps(res.zmm, k, f32_halves.zmm); + expected.zmm = _mm512_setzero_ps(); + for (i = 0; i < 16; i++) { + if (k & (1 << i)) { + expected.f32[i] = i + 1; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_ceil_ps", __LINE__); +} + +void NOINLINE do_ceilpd() { + V512 res; + V512 expected; + __mmask8 k; + volatile int i; + + res.zmmd = _mm512_ceil_pd(f64_halves.zmmd); + for (i = 0; i < 8; i++) { + expected.f64[i] = i + 1; + } + check_equal_nd(&res, &expected, 16, "_mm512_ceil_pd", __LINE__); + + f64_halves.xmmd[vol0] = f64_halves.xmmd[vol0]; /* No-op. 
*/ + + k = 0x7b; + res.zmmd = _mm512_setzero_pd(); + res.zmmd = _mm512_mask_ceil_pd(res.zmmd, k, f64_halves.zmmd); + expected.zmmd = _mm512_setzero_pd(); + for (i = 0; i < 8; i++) { + if (k & (1 << i)) { + expected.f64[i] = i + 1; + } + } + + check_equal_nd(&res, &expected, 16, "_mm512_mask_ceil_pd", __LINE__); +} + +void NOINLINE do_getexpsd() { + __mmask8 k8 = 0x2; + volatile __m128d res; + volatile __m128d v1 = _mm_set_pd(8.0, 32.0); + volatile __m128d v2 = _mm_set_pd(16.0, 64.0); + + __m128d res_exp_nomask = _mm_set_pd(8.0, 6.0); + __m128d res_exp_mask = _mm_set_pd(8.0, 32.0); + __m128d res_exp_maskz = _mm_set_pd(8.0, 0.0); + + res = _mm_setzero_pd(); + res = _mm_getexp_sd(v1, v2); + + check_equal_ndf((void *)&res, (void *)&res_exp_nomask, 2, "_mm_getexp_sd", + __LINE__); + + res = _mm_setzero_pd(); + res = _mm_mask_getexp_sd(v1, k8, v1, v2); + check_equal_ndf((void *)&res, (void *)&res_exp_mask, 2, "_mm_mask_getexp_sd", + __LINE__); + + res = _mm_setzero_pd(); + res = _mm_maskz_getexp_sd(k8, v1, v2); + check_equal_ndf((void *)&res, (void *)&res_exp_maskz, 2, + "_mm_maskz_getexp_sd", __LINE__); +} + +void NOINLINE do_getexpss() { + __mmask8 k8 = 0xe; + volatile __m128 res; + volatile __m128 v1 = _mm_set_ps(16.0f, 32.0f, 64.0f, 128.0f); + volatile __m128 v2 = _mm_set_ps(128.0f, 256.0f, 512.0f, 1024.0f); + + volatile __m128 res_exp_nomask = _mm_set_ps(16.0f, 32.0f, 64.0f, 10.0f); + volatile __m128 res_exp_mask = _mm_set_ps(16.0f, 32.0f, 64.0f, 128.0f); + volatile __m128 res_exp_maskz = _mm_set_ps(16.0f, 32.0f, 64.0f, 0.0f); + + res = _mm_setzero_ps(); + res = _mm_getexp_ss(v1, v2); + check_equal_nsf((void *)&res, (void *)&res_exp_nomask, 4, "_mm_getexp_ss", + __LINE__); + + res = _mm_setzero_ps(); + res = _mm_mask_getexp_ss(v1, k8, v1, v2); + check_equal_nsf((void *)&res, (void *)&res_exp_mask, 2, "_mm_mask_getexp_ss", + __LINE__); + + res = _mm_setzero_ps(); + res = _mm_maskz_getexp_ss(k8, v1, v2); + check_equal_nsf((void *)&res, (void *)&res_exp_maskz, 4, + 
"_mm_maskz_getexp_ss", __LINE__); +} + +void NOINLINE do_getmantpd() { + volatile V512 res; + V512 expected; + __mmask8 k = 0x75; + + res.zmmd = + _mm512_getmant_pd(f64.zmmd, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_zero); + + res.zmmi = _mm512_setzero_epi32(); + res.zmmd = _mm512_mask_getmant_pd(res.zmmd, k, f64.zmmd, _MM_MANT_NORM_p5_2, + _MM_MANT_SIGN_zero); +} + +void NOINLINE do_getmantps() { + volatile V512 res; + V512 expected; + __mmask16 k = 0x7e95; + + res.zmm = _mm512_getmant_ps(f32.zmm, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_zero); + + res.zmmi = _mm512_setzero_epi32(); + res.zmm = _mm512_mask_getmant_ps(res.zmm, k, f32.zmm, _MM_MANT_NORM_p5_2, + _MM_MANT_SIGN_zero); +} + +#define CHECK_SCALEFPD(n_elems, dest, mask, zeroing, name) \ + { \ + volatile int i; \ + for (i = 0; i < n_elems; i++) { \ + expected.f64[i] = f64.f64[i] * (pow(2.0, floor(f64_squares.f64[i]))); \ + if ((mask & (1 << i)) == 0) { \ + if (zeroing) { \ + expected.f64[i] = 0.0; \ + } else { \ + expected.f64[i] = dest.f64[i]; \ + } \ + } \ + } \ + check_equal_nd(&res, &expected, n_elems * 2, name, __LINE__); \ + f64.xmmd[vol0] = f64.xmmd[vol0]; \ + } + +void NOINLINE do_scalefpd() { + V512 res; + V512 expected; + __mmask8 k = 0xFF; + + res.zmmd = _mm512_scalef_round_pd(f64.zmmd, f64_squares.zmmd, + _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); + CHECK_SCALEFPD(8, f64_halves, k, 0, "_mm512_scalef_round_pd"); + + res.zmmd = _mm512_scalef_pd(f64.zmmd, f64_squares.zmmd); + CHECK_SCALEFPD(8, f64_halves, k, 0, "_mm512_scalef_pd"); + + k = 0x75; + + res.zmmd = _mm512_mask_scalef_round_pd( + f64_halves.zmmd, k, f64.zmmd, f64_squares.zmmd, + _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + CHECK_SCALEFPD(8, f64_halves, k, 0, "_mm512_mask_scalef_round_pd"); + + res.zmmd = + _mm512_mask_scalef_pd(f64_halves.zmmd, k, f64.zmmd, f64_squares.zmmd); + CHECK_SCALEFPD(8, f64_halves, k, 0, "_mm512_mask_scalef_pd"); + + k = 0x57; + + res.zmmd = _mm512_maskz_scalef_round_pd( + k, f64.zmmd, f64_squares.zmmd, 
_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + CHECK_SCALEFPD(8, f64_halves, k, 1, "_mm512_maskz_scalef_round_pd"); + + res.zmmd = _mm512_maskz_scalef_pd(k, f64.zmmd, f64_squares.zmmd); + CHECK_SCALEFPD(8, f64_halves, k, 1, "_mm512_maskz_scalef_pd"); +} + +#define CHECK_SCALEFPS(n_elems, dest, mask, zeroing, name) \ + { \ + volatile int i; \ + for (i = 0; i < n_elems; i++) { \ + expected.f32[i] = f32.f32[i] * (powf(2.0F, floorf(f32_squares.f32[i]))); \ + if ((mask & (1 << i)) == 0) { \ + if (zeroing) { \ + expected.f32[i] = 0.0F; \ + } else { \ + expected.f32[i] = dest.f32[i]; \ + } \ + } \ + } \ + check_equal_nd(&res, &expected, n_elems, name, __LINE__); \ + f32.xmm[vol0] = f32.xmm[vol0]; \ + } + +void NOINLINE do_scalefps() { + V512 res; + V512 expected; + __mmask16 k = 0xFFFF; + + res.zmm = _mm512_scalef_round_ps(f32.zmm, f32_squares.zmm, + _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); + CHECK_SCALEFPS(16, f32_halves, k, 0, "_mm512_scalef_round_ps"); + + res.zmm = _mm512_scalef_ps(f32.zmm, f32_squares.zmm); + CHECK_SCALEFPS(16, f32_halves, k, 0, "_mm512_scalef_ps"); + + k = 0x0bcd; + + res.zmm = + _mm512_mask_scalef_round_ps(f32_halves.zmm, k, f32.zmm, f32_squares.zmm, + _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + CHECK_SCALEFPS(16, f32_halves, k, 0, "_mm512_mask_scalef_round_ps"); + + res.zmm = _mm512_mask_scalef_ps(f32_halves.zmm, k, f32.zmm, f32_squares.zmm); + CHECK_SCALEFPS(16, f32_halves, k, 0, "_mm512_mask_scalef_ps"); + + k = 0x0dcb; + + res.zmm = _mm512_maskz_scalef_round_ps( + k, f32.zmm, f32_squares.zmm, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); + CHECK_SCALEFPS(16, f32_halves, k, 1, "_mm512_maskz_scalef_round_ps"); + + res.zmm = _mm512_maskz_scalef_ps(k, f32.zmm, f32_squares.zmm); + CHECK_SCALEFPS(16, f32_halves, k, 1, "_mm512_maskz_scalef_ps"); +} + +#define SOME_ROUND (_MM_FROUND_CUR_DIRECTION) + +void NOINLINE do_fixupimmpd() { + volatile V512 res; + V512 expected; + __mmask8 k = 0x75; + + res.zmmd = _mm512_fixupimm_pd(f64.zmmd, f64_squares.zmmd, 
i32.zmmi, 0x97); + + res.zmmi = _mm512_setzero_epi32(); + res.zmmd = _mm512_mask_fixupimm_pd(res.zmmd, k, f64.zmmd, i32.zmmi, 0xfe); + + res.zmmi = _mm512_setzero_epi32(); + res.zmmd = _mm512_maskz_fixupimm_pd(k, res.zmmd, f64.zmmd, i32.zmmi, 0xfe); + + res.zmmd = _mm512_fixupimm_round_pd(f64.zmmd, f64_squares.zmmd, i32.zmmi, + 0x97, SOME_ROUND); + + res.zmmi = _mm512_setzero_epi32(); + res.zmmd = _mm512_mask_fixupimm_round_pd(res.zmmd, k, f64.zmmd, i32.zmmi, + 0xfe, SOME_ROUND); + + res.zmmi = _mm512_setzero_epi32(); + res.zmmd = _mm512_maskz_fixupimm_round_pd(k, res.zmmd, f64.zmmd, i32.zmmi, + 0xfe, SOME_ROUND); +} + +void NOINLINE do_fixupimmps() { + volatile V512 res; + V512 expected; + __mmask16 k = 0x75; + + res.zmm = _mm512_fixupimm_ps(f32.zmm, f32_squares.zmm, i32.zmmi, 0x97); + + res.zmmi = _mm512_setzero_epi32(); + res.zmm = _mm512_mask_fixupimm_ps(res.zmm, k, f32.zmm, i32.zmmi, 0xfe); + + res.zmmi = _mm512_setzero_epi32(); + res.zmm = _mm512_maskz_fixupimm_ps(k, res.zmm, f32.zmm, i32.zmmi, 0xfe); + + res.zmm = _mm512_fixupimm_round_ps(f32.zmm, f32_squares.zmm, i32.zmmi, 0x97, + SOME_ROUND); + + res.zmmi = _mm512_setzero_epi32(); + res.zmm = _mm512_mask_fixupimm_round_ps(res.zmm, k, f32.zmm, i32.zmmi, 0xfe, + SOME_ROUND); + + res.zmmi = _mm512_setzero_epi32(); + res.zmm = _mm512_maskz_fixupimm_round_ps(k, res.zmm, f32.zmm, i32.zmmi, 0xfe, + SOME_ROUND); +} + +void NOINLINE do_fixupimmsd() { + volatile V512 res; + V512 expected; + + __mmask8 k = 0x75; + + res.xmmd[0] = + _mm_fixupimm_sd(f64.xmmd[0], f64_squares.xmmd[0], i32.xmmi[0], 0x97); + + res.xmmi[0] = _mm_setzero_si128(); + res.xmmd[0] = + _mm_mask_fixupimm_sd(res.xmmd[0], k, f64.xmmd[0], i32.xmmi[0], 0xfe); + + res.xmmi[0] = _mm_setzero_si128(); + res.xmmd[0] = + _mm_maskz_fixupimm_sd(k, res.xmmd[0], f64.xmmd[0], i32.xmmi[0], 0xfe); + + res.xmmd[0] = _mm_fixupimm_round_sd(f64.xmmd[0], f64_squares.xmmd[0], + i32.xmmi[0], 0x97, SOME_ROUND); + + res.xmmi[0] = _mm_setzero_si128(); + res.xmmd[0] = 
_mm_mask_fixupimm_round_sd(res.xmmd[0], k, f64.xmmd[0], + i32.xmmi[0], 0xfe, SOME_ROUND); + + res.xmmi[0] = _mm_setzero_si128(); + res.xmmd[0] = _mm_maskz_fixupimm_round_sd(k, res.xmmd[0], f64.xmmd[0], + i32.xmmi[0], 0xfe, SOME_ROUND); +} + +void NOINLINE do_fixupimmss() { + volatile V512 res; + V512 expected; + __mmask8 k = 0x75; + + res.xmm[0] = + _mm_fixupimm_ss(f32.xmm[0], f32_squares.xmm[0], i32.xmmi[0], 0x97); + + res.xmmi[0] = _mm_setzero_si128(); + res.xmm[0] = + _mm_mask_fixupimm_ss(res.xmm[0], k, f32.xmm[0], i32.xmmi[0], 0xfe); + + res.xmmi[0] = _mm_setzero_si128(); + res.xmm[0] = + _mm_maskz_fixupimm_ss(k, res.xmm[0], f32.xmm[0], i32.xmmi[0], 0xfe); + + res.xmm[0] = _mm_fixupimm_round_ss(f64.xmm[0], f64_squares.xmm[0], + i32.xmmi[0], 0x97, SOME_ROUND); + + res.xmmi[0] = _mm_setzero_si128(); + res.xmm[0] = _mm_mask_fixupimm_round_ss(res.xmm[0], k, f64.xmm[0], + i32.xmmi[0], 0xfe, SOME_ROUND); + + res.xmmi[0] = _mm_setzero_si128(); + res.xmm[0] = _mm_maskz_fixupimm_round_ss(k, res.xmm[0], f64.xmm[0], + i32.xmmi[0], 0xfe, SOME_ROUND); +} + +void NOINLINE do_roundscalepd() { + volatile V512 res; + V512 expected; + __mmask16 k = 0x3d; + + res.zmmd = _mm512_roundscale_pd(f64.zmmd, 0xff); + + res.zmmi = _mm512_setzero_epi32(); + res.zmmd = _mm512_mask_roundscale_pd(res.zmmd, k, f64.zmmd, 0x36); +} + +void NOINLINE do_roundscaleps() { + volatile V512 res; + V512 expected; + __mmask16 k = 0x74cb; + + res.zmm = _mm512_roundscale_ps(f32.zmm, 0xf7); + + res.zmmi = _mm512_setzero_epi32(); + res.zmm = _mm512_mask_roundscale_ps(res.zmm, k, f32.zmm, 0x36); +} + +static int NOINLINE emulate_ternarylogicd(int a, int b, int c, int imm) { + int i, index, res = 0; + + for (i = 0; i < 32; i++) { + index = ((a & 1) << 2) | ((b & 1) << 1) | (c & 1); + res |= ((imm & (1 << index)) ? 
1 : 0) << i; + a >>= 1; + b >>= 1; + c >>= 1; + } + + return res; +} + +void NOINLINE do_pternlogq() { + volatile int i; + V512 res, resx, resy; + V512 expected; + __mmask8 k8 = 0x75; + + res.zmmi = + _mm512_ternarylogic_epi64(i64.zmmi, i64_squares.zmmi, i64.zmmi, 0x79); + for (i = 0; i < 16; i++) { + expected.s32[i] = + emulate_ternarylogicd(i64.s32[i], i64_squares.s32[i], i64.s32[i], 0x79); + } + check_equal_nd(&res, &expected, 16, "_mm512_ternarylogic_epi64", __LINE__); + + i64.xmm[vol0] = i64.xmm[vol0]; /* No-op. */ + + res.zmmi = _mm512_setzero_epi32(); + res.zmmi = _mm512_mask_ternarylogic_epi64(res.zmmi, k8, i64_neg.zmmi, + i64.zmmi, 0xca); + for (i = 0; i < 16; i += 2) { + if (k8 & (1 << (i / 2))) { + expected.s32[i] = + emulate_ternarylogicd(0, i64_neg.s32[i], i64.s32[i], 0xca); + expected.s32[i + 1] = + emulate_ternarylogicd(0, i64_neg.s32[i + 1], i64.s32[i + 1], 0xca); + } else { + expected.s32[i] = 0; + expected.s32[i + 1] = 0; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_ternarylogic_epi64", + __LINE__); + + i64.xmm[vol0] = i64.xmm[vol0]; /* No-op. 
*/ + + res.zmmi = _mm512_maskz_ternarylogic_epi64(k8, i64_squares.zmmi, + i64_squares.zmmi, i64.zmmi, 0x3b); + for (i = 0; i < 16; i += 2) { + if (k8 & (1 << (i / 2))) { + expected.s32[i] = emulate_ternarylogicd( + i64_squares.s32[i], i64_squares.s32[i], i64.s32[i], 0x3b); + expected.s32[i + 1] = emulate_ternarylogicd( + i64_squares.s32[i + 1], i64_squares.s32[i + 1], i64.s32[i], 0x3b); + } else { + expected.s32[i] = 0; + expected.s32[i + 1] = 0; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_maskz_ternarylogic_epi64", + __LINE__); +} + +void NOINLINE do_pternlogd() { + volatile int i; + V512 res, resx, resy; + V512 expected; + __mmask16 k = 0x23bc; + __mmask8 k8 = (__mmask8)k; + + res.zmmi = + _mm512_ternarylogic_epi32(i32.zmmi, i32_squares.zmmi, i32.zmmi, 0x97); + for (i = 0; i < 16; i++) { + expected.s32[i] = + emulate_ternarylogicd(i32.s32[i], i32_squares.s32[i], i32.s32[i], 0x97); + } + check_equal_nd(&res, &expected, 16, "_mm512_ternarylogic_epi32", __LINE__); + + i32.xmm[vol0] = i32.xmm[vol0]; /* No-op. */ + + res.zmmi = _mm512_setzero_epi32(); + res.zmmi = _mm512_mask_ternarylogic_epi32(res.zmmi, k, i32_squares.zmmi, + i32.zmmi, 0xfe); + for (i = 0; i < 16; i++) { + if (k & (1 << i)) { + expected.s32[i] = + emulate_ternarylogicd(0, i32_squares.s32[i], i32.s32[i], 0xfe); + } else { + expected.s32[i] = 0; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_mask_ternarylogic_epi32", + __LINE__); + + i32.xmm[vol0] = i32.xmm[vol0]; /* No-op. 
*/ + + k = 0xabcd; + k8 = (__mmask8)k; + res.zmmi = _mm512_maskz_ternarylogic_epi32(k, i32_squares.zmmi, i32_neg.zmmi, + i32.zmmi, 0x3b); + for (i = 0; i < 16; i++) { + if (k & (1 << i)) { + expected.s32[i] = emulate_ternarylogicd(i32_squares.s32[i], + i32_neg.s32[i], i32.s32[i], 0x3b); + } else { + expected.s32[i] = 0; + } + } + check_equal_nd(&res, &expected, 16, "_mm512_maskz_ternarylogic_epi32", + __LINE__); +} + +int main(int argc, char *argv[]) { + init(); + + do_rcp14pd(); + do_rcp14ps(); + + do_sqrtps(); + do_sqrtpd(); + + do_floorps(); + do_floorpd(); + + do_ceilps(); + do_ceilpd(); + + do_getexpsd(); + do_getexpss(); + + do_getmantpd(); + do_getmantps(); + + do_scalefpd(); + do_scalefps(); + + do_fixupimmpd(); + do_fixupimmps(); + + do_fixupimmsd(); + do_fixupimmss(); + + do_roundscalepd(); + do_roundscaleps(); + + do_pternlogq(); + do_pternlogd(); + + if (n_errs != 0) { + printf("FAILED\n"); + return 1; + } + + printf("PASSED\n"); + return 0; +} |