Diffstat (limited to 'test/CodeGen/X86')
-rw-r--r--  test/CodeGen/X86/2009-03-16-PHIElimInLPad.ll | 1
-rw-r--r--  test/CodeGen/X86/2011-10-19-widen_vselect.ll | 1
-rw-r--r--  test/CodeGen/X86/GlobalISel/add-scalar.ll | 1
-rw-r--r--  test/CodeGen/X86/GlobalISel/brcond.ll | 1
-rw-r--r--  test/CodeGen/X86/GlobalISel/callingconv.ll | 21
-rw-r--r--  test/CodeGen/X86/GlobalISel/frameIndex.ll | 1
-rw-r--r--  test/CodeGen/X86/GlobalISel/select-cmp.mir | 26
-rw-r--r--  test/CodeGen/X86/GlobalISel/select-copy.mir | 6
-rw-r--r--  test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir | 10
-rw-r--r--  test/CodeGen/X86/GlobalISel/select-ext.mir | 12
-rw-r--r--  test/CodeGen/X86/GlobalISel/select-intrinsic-x86-flags-read-u32.mir | 2
-rw-r--r--  test/CodeGen/X86/O0-pipeline.ll | 1
-rw-r--r--  test/CodeGen/X86/TruncAssertZext.ll | 1
-rw-r--r--  test/CodeGen/X86/avg.ll | 185
-rw-r--r--  test/CodeGen/X86/avx-basic.ll | 7
-rw-r--r--  test/CodeGen/X86/avx-intrinsics-x86.ll | 52
-rw-r--r--  test/CodeGen/X86/avx-schedule.ll | 8
-rw-r--r--  test/CodeGen/X86/avx512-mask-op.ll | 10
-rw-r--r--  test/CodeGen/X86/avx512-regcall-Mask.ll | 22
-rw-r--r--  test/CodeGen/X86/avx512-regcall-NoMask.ll | 17
-rwxr-xr-x  test/CodeGen/X86/avx512-schedule.ll | 4
-rw-r--r--  test/CodeGen/X86/avx512-select.ll | 1
-rwxr-xr-x  test/CodeGen/X86/avx512-shuffle-schedule.ll | 736
-rw-r--r--  test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll | 368
-rw-r--r--  test/CodeGen/X86/avx512-skx-insert-subvec.ll | 2
-rw-r--r--  test/CodeGen/X86/avx512-vbroadcast.ll | 2
-rw-r--r--  test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll | 2
-rw-r--r--  test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll | 12
-rw-r--r--  test/CodeGen/X86/avx512bw-intrinsics.ll | 4
-rw-r--r--  test/CodeGen/X86/avx512bw-vec-test-testn.ll | 32
-rw-r--r--  test/CodeGen/X86/avx512bwvl-vec-test-testn.ll | 64
-rw-r--r--  test/CodeGen/X86/avx512cd-intrinsics-fast-isel.ll | 37
-rw-r--r--  test/CodeGen/X86/avx512cd-intrinsics-upgrade.ll | 23
-rw-r--r--  test/CodeGen/X86/avx512cd-intrinsics.ll | 22
-rw-r--r--  test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll | 44
-rw-r--r--  test/CodeGen/X86/avx512cdvl-intrinsics.ll | 43
-rw-r--r--  test/CodeGen/X86/avx512f-vec-test-testn.ll | 32
-rw-r--r--  test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll | 30
-rw-r--r--  test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll | 4
-rw-r--r--  test/CodeGen/X86/avx512vl-intrinsics.ll | 12
-rw-r--r--  test/CodeGen/X86/avx512vl-vbroadcast.ll | 3
-rw-r--r--  test/CodeGen/X86/avx512vl-vec-masked-cmp.ll | 520
-rw-r--r--  test/CodeGen/X86/avx512vl-vec-test-testn.ll | 128
-rw-r--r--  test/CodeGen/X86/avx512vlcd-intrinsics-fast-isel.ll | 75
-rw-r--r--  test/CodeGen/X86/bitcast-and-setcc-256.ll | 1
-rw-r--r--  test/CodeGen/X86/bitcast-and-setcc-512.ll | 4
-rw-r--r--  test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll | 6
-rw-r--r--  test/CodeGen/X86/bitcast-setcc-256.ll | 1
-rw-r--r--  test/CodeGen/X86/bitcast-setcc-512.ll | 4
-rw-r--r--  test/CodeGen/X86/bool-vector.ll | 3
-rw-r--r--  test/CodeGen/X86/broadcastm-lowering.ll | 7
-rw-r--r--  test/CodeGen/X86/cmp.ll | 3
-rw-r--r--  test/CodeGen/X86/combine-srl.ll | 2
-rw-r--r--  test/CodeGen/X86/compress_expand.ll | 4
-rw-r--r--  test/CodeGen/X86/emutls-pie.ll | 6
-rw-r--r--  test/CodeGen/X86/emutls.ll | 16
-rw-r--r--  test/CodeGen/X86/epilogue-cfi-fp.ll | 43
-rw-r--r--  test/CodeGen/X86/epilogue-cfi-no-fp.ll | 46
-rw-r--r--  test/CodeGen/X86/f16c-intrinsics.ll | 271
-rw-r--r--  test/CodeGen/X86/fast-isel-int-float-conversion.ll | 12
-rw-r--r--  test/CodeGen/X86/fast-isel-store.ll | 10
-rw-r--r--  test/CodeGen/X86/fma-intrinsics-x86.ll | 874
-rw-r--r--  test/CodeGen/X86/frame-lowering-debug-intrinsic-2.ll | 18
-rw-r--r--  test/CodeGen/X86/frame-lowering-debug-intrinsic.ll | 4
-rw-r--r--  test/CodeGen/X86/haddsub-2.ll | 12
-rw-r--r--  test/CodeGen/X86/hipe-cc64.ll | 1
-rw-r--r--  test/CodeGen/X86/horizontal-reduce-smax.ll | 1896
-rw-r--r--  test/CodeGen/X86/horizontal-reduce-smin.ll | 1898
-rw-r--r--  test/CodeGen/X86/horizontal-reduce-umax.ll | 2203
-rw-r--r--  test/CodeGen/X86/horizontal-reduce-umin.ll | 2207
-rw-r--r--  test/CodeGen/X86/illegal-bitfield-loadstore.ll | 1
-rw-r--r--  test/CodeGen/X86/imul.ll | 3
-rw-r--r--  test/CodeGen/X86/inline-asm-A-constraint.ll | 3
-rw-r--r--  test/CodeGen/X86/lea-opt-cse1.ll | 1
-rw-r--r--  test/CodeGen/X86/lea-opt-cse2.ll | 2
-rw-r--r--  test/CodeGen/X86/lea-opt-cse3.ll | 2
-rw-r--r--  test/CodeGen/X86/lea-opt-cse4.ll | 3
-rw-r--r--  test/CodeGen/X86/legalize-shift-64.ll | 5
-rw-r--r--  test/CodeGen/X86/live-out-reg-info.ll | 1
-rw-r--r--  test/CodeGen/X86/load-combine.ll | 2
-rw-r--r--  test/CodeGen/X86/masked_gather_scatter.ll | 34
-rw-r--r--  test/CodeGen/X86/masked_memop.ll | 12
-rw-r--r--  test/CodeGen/X86/memcmp-optsize.ll | 224
-rw-r--r--  test/CodeGen/X86/memcmp.ll | 240
-rw-r--r--  test/CodeGen/X86/memset-nonzero.ll | 1
-rw-r--r--  test/CodeGen/X86/merge-consecutive-loads-128.ll | 19
-rw-r--r--  test/CodeGen/X86/movtopush.ll | 4
-rw-r--r--  test/CodeGen/X86/mul-constant-result.ll | 59
-rw-r--r--  test/CodeGen/X86/mul-i256.ll | 8
-rw-r--r--  test/CodeGen/X86/mul128.ll | 5
-rw-r--r--  test/CodeGen/X86/no-plt.ll | 30
-rw-r--r--  test/CodeGen/X86/pop-stack-cleanup-msvc.ll | 26
-rw-r--r--  test/CodeGen/X86/pr21792.ll | 1
-rw-r--r--  test/CodeGen/X86/pr29061.ll | 2
-rw-r--r--  test/CodeGen/X86/pr29112.ll | 1
-rw-r--r--  test/CodeGen/X86/pr30430.ll | 1
-rw-r--r--  test/CodeGen/X86/pr32241.ll | 2
-rw-r--r--  test/CodeGen/X86/pr32256.ll | 1
-rw-r--r--  test/CodeGen/X86/pr32282.ll | 1
-rw-r--r--  test/CodeGen/X86/pr32284.ll | 16
-rw-r--r--  test/CodeGen/X86/pr32329.ll | 4
-rw-r--r--  test/CodeGen/X86/pr32345.ll | 2
-rw-r--r--  test/CodeGen/X86/pr32451.ll | 2
-rw-r--r--  test/CodeGen/X86/pr34088.ll | 1
-rw-r--r--  test/CodeGen/X86/pr34653.ll | 210
-rw-r--r--  test/CodeGen/X86/pr34657.ll | 20
-rw-r--r--  test/CodeGen/X86/pr9743.ll | 1
-rw-r--r--  test/CodeGen/X86/push-cfi-debug.ll | 4
-rw-r--r--  test/CodeGen/X86/push-cfi-obj.ll | 7
-rw-r--r--  test/CodeGen/X86/push-cfi.ll | 3
-rw-r--r--  test/CodeGen/X86/recip-fastmath.ll | 16
-rw-r--r--  test/CodeGen/X86/recip-fastmath2.ll | 32
-rw-r--r--  test/CodeGen/X86/return-ext.ll | 3
-rw-r--r--  test/CodeGen/X86/rtm.ll | 1
-rw-r--r--  test/CodeGen/X86/schedule-x86_32.ll | 348
-rw-r--r--  test/CodeGen/X86/schedule-x86_64.ll | 737
-rw-r--r--  test/CodeGen/X86/select-mmx.ll | 2
-rw-r--r--  test/CodeGen/X86/select.ll | 38
-rw-r--r--  test/CodeGen/X86/setcc-lowering.ll | 8
-rw-r--r--  test/CodeGen/X86/shrink_vmul.ll | 13
-rw-r--r--  test/CodeGen/X86/sse-intrinsics-x86.ll | 52
-rw-r--r--  test/CodeGen/X86/sse-schedule.ll | 8
-rw-r--r--  test/CodeGen/X86/sse2-intrinsics-x86.ll | 28
-rw-r--r--  test/CodeGen/X86/statepoint-call-lowering.ll | 1
-rw-r--r--  test/CodeGen/X86/statepoint-gctransition-call-lowering.ll | 1
-rw-r--r--  test/CodeGen/X86/statepoint-invoke.ll | 3
-rw-r--r--  test/CodeGen/X86/throws-cfi-fp.ll | 98
-rw-r--r--  test/CodeGen/X86/throws-cfi-no-fp.ll | 97
-rw-r--r--  test/CodeGen/X86/var-permute-128.ll | 199
-rw-r--r--  test/CodeGen/X86/var-permute-256.ll | 1020
-rw-r--r--  test/CodeGen/X86/var-permute-512.ll | 618
-rw-r--r--  test/CodeGen/X86/vec_fp_to_int.ll | 74
-rw-r--r--  test/CodeGen/X86/vector-half-conversions.ll | 2620
-rw-r--r--  test/CodeGen/X86/vector-sext.ll | 13
-rw-r--r--  test/CodeGen/X86/vector-shuffle-256-v16.ll | 18
-rw-r--r--  test/CodeGen/X86/vector-shuffle-256-v4.ll | 8
-rw-r--r--  test/CodeGen/X86/vector-shuffle-256-v8.ll | 45
-rw-r--r--  test/CodeGen/X86/vector-shuffle-512-v8.ll | 21
-rw-r--r--  test/CodeGen/X86/vector-shuffle-avx512.ll | 2
-rw-r--r--  test/CodeGen/X86/vector-shuffle-v1.ll | 2
-rw-r--r--  test/CodeGen/X86/vector-trunc.ll | 73
-rw-r--r--  test/CodeGen/X86/wide-integer-cmp.ll | 3
-rw-r--r--  test/CodeGen/X86/x86-framelowering-trap.ll | 1
-rw-r--r--  test/CodeGen/X86/x86-interleaved-access.ll | 1
-rw-r--r--  test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll | 1
145 files changed, 14348 insertions, 4958 deletions
diff --git a/test/CodeGen/X86/2009-03-16-PHIElimInLPad.ll b/test/CodeGen/X86/2009-03-16-PHIElimInLPad.ll
index 6814ed1d894..109962c2859 100644
--- a/test/CodeGen/X86/2009-03-16-PHIElimInLPad.ll
+++ b/test/CodeGen/X86/2009-03-16-PHIElimInLPad.ll
@@ -23,6 +23,7 @@ lpad: ; preds = %cont, %entry
}
; CHECK: lpad
+; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: Ltmp
declare i32 @__gxx_personality_v0(...)
diff --git a/test/CodeGen/X86/2011-10-19-widen_vselect.ll b/test/CodeGen/X86/2011-10-19-widen_vselect.ll
index 416761ffef4..dd059100503 100644
--- a/test/CodeGen/X86/2011-10-19-widen_vselect.ll
+++ b/test/CodeGen/X86/2011-10-19-widen_vselect.ll
@@ -88,6 +88,7 @@ define void @full_test() {
; X32-NEXT: movss %xmm4, {{[0-9]+}}(%esp)
; X32-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT: addl $60, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: full_test:
diff --git a/test/CodeGen/X86/GlobalISel/add-scalar.ll b/test/CodeGen/X86/GlobalISel/add-scalar.ll
index 64a6313023b..9d28f441fb7 100644
--- a/test/CodeGen/X86/GlobalISel/add-scalar.ll
+++ b/test/CodeGen/X86/GlobalISel/add-scalar.ll
@@ -20,6 +20,7 @@ define i64 @test_add_i64(i64 %arg1, i64 %arg2) {
; X32-NEXT: addl 8(%ebp), %eax
; X32-NEXT: adcl 12(%ebp), %edx
; X32-NEXT: popl %ebp
+; X32-NEXT: .cfi_def_cfa %esp, 4
; X32-NEXT: retl
%ret = add i64 %arg1, %arg2
ret i64 %ret
diff --git a/test/CodeGen/X86/GlobalISel/brcond.ll b/test/CodeGen/X86/GlobalISel/brcond.ll
index 917ee6f5bd8..2467344776e 100644
--- a/test/CodeGen/X86/GlobalISel/brcond.ll
+++ b/test/CodeGen/X86/GlobalISel/brcond.ll
@@ -36,6 +36,7 @@ define i32 @test_1(i32 %a, i32 %b, i32 %tValue, i32 %fValue) {
; X32-NEXT: movl %eax, (%esp)
; X32-NEXT: movl (%esp), %eax
; X32-NEXT: popl %ecx
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
entry:
%retval = alloca i32, align 4
diff --git a/test/CodeGen/X86/GlobalISel/callingconv.ll b/test/CodeGen/X86/GlobalISel/callingconv.ll
index 4100a7217ac..23987a3c365 100644
--- a/test/CodeGen/X86/GlobalISel/callingconv.ll
+++ b/test/CodeGen/X86/GlobalISel/callingconv.ll
@@ -117,6 +117,7 @@ define <8 x i32> @test_v8i32_args(<8 x i32> %arg1, <8 x i32> %arg2) {
; X32-NEXT: movups 16(%esp), %xmm1
; X32-NEXT: movaps %xmm2, %xmm0
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_v8i32_args:
@@ -135,6 +136,7 @@ define void @test_trivial_call() {
; X32-NEXT: .cfi_def_cfa_offset 16
; X32-NEXT: calll trivial_callee
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_trivial_call:
@@ -143,6 +145,7 @@ define void @test_trivial_call() {
; X64-NEXT: .cfi_def_cfa_offset 16
; X64-NEXT: callq trivial_callee
; X64-NEXT: popq %rax
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
call void @trivial_callee()
ret void
@@ -160,6 +163,7 @@ define void @test_simple_arg_call(i32 %in0, i32 %in1) {
; X32-NEXT: movl %eax, 4(%esp)
; X32-NEXT: calll simple_arg_callee
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_simple_arg_call:
@@ -171,6 +175,7 @@ define void @test_simple_arg_call(i32 %in0, i32 %in1) {
; X64-NEXT: movl %eax, %esi
; X64-NEXT: callq simple_arg_callee
; X64-NEXT: popq %rax
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
call void @simple_arg_callee(i32 %in1, i32 %in0)
ret void
@@ -193,6 +198,7 @@ define void @test_simple_arg8_call(i32 %in0) {
; X32-NEXT: movl %eax, 28(%esp)
; X32-NEXT: calll simple_arg8_callee
; X32-NEXT: addl $44, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_simple_arg8_call:
@@ -208,6 +214,7 @@ define void @test_simple_arg8_call(i32 %in0) {
; X64-NEXT: movl %edi, %r9d
; X64-NEXT: callq simple_arg8_callee
; X64-NEXT: addq $24, %rsp
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
call void @simple_arg8_callee(i32 %in0, i32 %in0, i32 %in0, i32 %in0,i32 %in0, i32 %in0, i32 %in0, i32 %in0)
ret void
@@ -224,6 +231,7 @@ define i32 @test_simple_return_callee() {
; X32-NEXT: calll simple_return_callee
; X32-NEXT: addl %eax, %eax
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_simple_return_callee:
@@ -234,6 +242,7 @@ define i32 @test_simple_return_callee() {
; X64-NEXT: callq simple_return_callee
; X64-NEXT: addl %eax, %eax
; X64-NEXT: popq %rcx
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
%call = call i32 @simple_return_callee(i32 5)
%r = add i32 %call, %call
@@ -254,6 +263,7 @@ define <8 x i32> @test_split_return_callee(<8 x i32> %arg1, <8 x i32> %arg2) {
; X32-NEXT: paddd (%esp), %xmm0 # 16-byte Folded Reload
; X32-NEXT: paddd 16(%esp), %xmm1 # 16-byte Folded Reload
; X32-NEXT: addl $44, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_split_return_callee:
@@ -268,6 +278,7 @@ define <8 x i32> @test_split_return_callee(<8 x i32> %arg1, <8 x i32> %arg2) {
; X64-NEXT: paddd (%rsp), %xmm0 # 16-byte Folded Reload
; X64-NEXT: paddd 16(%rsp), %xmm1 # 16-byte Folded Reload
; X64-NEXT: addq $40, %rsp
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
%call = call <8 x i32> @split_return_callee(<8 x i32> %arg2)
%r = add <8 x i32> %arg1, %call
@@ -281,6 +292,7 @@ define void @test_indirect_call(void()* %func) {
; X32-NEXT: .cfi_def_cfa_offset 16
; X32-NEXT: calll *16(%esp)
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_indirect_call:
@@ -289,6 +301,7 @@ define void @test_indirect_call(void()* %func) {
; X64-NEXT: .cfi_def_cfa_offset 16
; X64-NEXT: callq *%rdi
; X64-NEXT: popq %rax
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
call void %func()
ret void
@@ -317,8 +330,11 @@ define void @test_abi_exts_call(i8* %addr) {
; X32-NEXT: movl %esi, (%esp)
; X32-NEXT: calll take_char
; X32-NEXT: addl $4, %esp
+; X32-NEXT: .cfi_def_cfa_offset 12
; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: popl %ebx
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_abi_exts_call:
@@ -335,6 +351,7 @@ define void @test_abi_exts_call(i8* %addr) {
; X64-NEXT: movl %ebx, %edi
; X64-NEXT: callq take_char
; X64-NEXT: popq %rbx
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
%val = load i8, i8* %addr
call void @take_char(i8 %val)
@@ -357,6 +374,7 @@ define void @test_variadic_call_1(i8** %addr_ptr, i32* %val_ptr) {
; X32-NEXT: movl %ecx, 4(%esp)
; X32-NEXT: calll variadic_callee
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_variadic_call_1:
@@ -368,6 +386,7 @@ define void @test_variadic_call_1(i8** %addr_ptr, i32* %val_ptr) {
; X64-NEXT: movb $0, %al
; X64-NEXT: callq variadic_callee
; X64-NEXT: popq %rax
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
%addr = load i8*, i8** %addr_ptr
@@ -393,6 +412,7 @@ define void @test_variadic_call_2(i8** %addr_ptr, double* %val_ptr) {
; X32-NEXT: movl %ecx, 4(%eax)
; X32-NEXT: calll variadic_callee
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_variadic_call_2:
@@ -405,6 +425,7 @@ define void @test_variadic_call_2(i8** %addr_ptr, double* %val_ptr) {
; X64-NEXT: movq %rcx, %xmm0
; X64-NEXT: callq variadic_callee
; X64-NEXT: popq %rax
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
%addr = load i8*, i8** %addr_ptr
diff --git a/test/CodeGen/X86/GlobalISel/frameIndex.ll b/test/CodeGen/X86/GlobalISel/frameIndex.ll
index 7b2a050f153..f260d0d707f 100644
--- a/test/CodeGen/X86/GlobalISel/frameIndex.ll
+++ b/test/CodeGen/X86/GlobalISel/frameIndex.ll
@@ -18,6 +18,7 @@ define i32* @allocai32() {
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movl %esp, %eax
; X32-NEXT: popl %ecx
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X32ABI-LABEL: allocai32:
diff --git a/test/CodeGen/X86/GlobalISel/select-cmp.mir b/test/CodeGen/X86/GlobalISel/select-cmp.mir
index 9058f010f76..3457e971b8d 100644
--- a/test/CodeGen/X86/GlobalISel/select-cmp.mir
+++ b/test/CodeGen/X86/GlobalISel/select-cmp.mir
@@ -100,7 +100,7 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:gr8 = COPY %sil
; CHECK: CMP8rr [[COPY]], [[COPY1]], implicit-def %eflags
; CHECK: [[SETEr:%[0-9]+]]:gr8 = SETEr implicit %eflags
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETEr]], 1
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETEr]], %subreg.sub_8bit
; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; CHECK: %eax = COPY [[AND32ri8_]]
; CHECK: RET 0, implicit %eax
@@ -131,7 +131,7 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:gr16 = COPY %si
; CHECK: CMP16rr [[COPY]], [[COPY1]], implicit-def %eflags
; CHECK: [[SETEr:%[0-9]+]]:gr8 = SETEr implicit %eflags
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETEr]], 1
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETEr]], %subreg.sub_8bit
; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; CHECK: %eax = COPY [[AND32ri8_]]
; CHECK: RET 0, implicit %eax
@@ -162,7 +162,7 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:gr64 = COPY %rsi
; CHECK: CMP64rr [[COPY]], [[COPY1]], implicit-def %eflags
; CHECK: [[SETEr:%[0-9]+]]:gr8 = SETEr implicit %eflags
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETEr]], 1
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETEr]], %subreg.sub_8bit
; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; CHECK: %eax = COPY [[AND32ri8_]]
; CHECK: RET 0, implicit %eax
@@ -193,7 +193,7 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
; CHECK: [[SETEr:%[0-9]+]]:gr8 = SETEr implicit %eflags
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETEr]], 1
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETEr]], %subreg.sub_8bit
; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; CHECK: %eax = COPY [[AND32ri8_]]
; CHECK: RET 0, implicit %eax
@@ -224,7 +224,7 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
; CHECK: [[SETNEr:%[0-9]+]]:gr8 = SETNEr implicit %eflags
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETNEr]], 1
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETNEr]], %subreg.sub_8bit
; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; CHECK: %eax = COPY [[AND32ri8_]]
; CHECK: RET 0, implicit %eax
@@ -255,7 +255,7 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
; CHECK: [[SETAr:%[0-9]+]]:gr8 = SETAr implicit %eflags
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETAr]], 1
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETAr]], %subreg.sub_8bit
; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; CHECK: %eax = COPY [[AND32ri8_]]
; CHECK: RET 0, implicit %eax
@@ -286,7 +286,7 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
; CHECK: [[SETAEr:%[0-9]+]]:gr8 = SETAEr implicit %eflags
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETAEr]], 1
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETAEr]], %subreg.sub_8bit
; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; CHECK: %eax = COPY [[AND32ri8_]]
; CHECK: RET 0, implicit %eax
@@ -317,7 +317,7 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
; CHECK: [[SETBr:%[0-9]+]]:gr8 = SETBr implicit %eflags
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETBr]], 1
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETBr]], %subreg.sub_8bit
; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; CHECK: %eax = COPY [[AND32ri8_]]
; CHECK: RET 0, implicit %eax
@@ -348,7 +348,7 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
; CHECK: [[SETBEr:%[0-9]+]]:gr8 = SETBEr implicit %eflags
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETBEr]], 1
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETBEr]], %subreg.sub_8bit
; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; CHECK: %eax = COPY [[AND32ri8_]]
; CHECK: RET 0, implicit %eax
@@ -379,7 +379,7 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
; CHECK: [[SETGr:%[0-9]+]]:gr8 = SETGr implicit %eflags
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETGr]], 1
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETGr]], %subreg.sub_8bit
; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; CHECK: %eax = COPY [[AND32ri8_]]
; CHECK: RET 0, implicit %eax
@@ -410,7 +410,7 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
; CHECK: [[SETGEr:%[0-9]+]]:gr8 = SETGEr implicit %eflags
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETGEr]], 1
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETGEr]], %subreg.sub_8bit
; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; CHECK: %eax = COPY [[AND32ri8_]]
; CHECK: RET 0, implicit %eax
@@ -441,7 +441,7 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
; CHECK: [[SETLr:%[0-9]+]]:gr8 = SETLr implicit %eflags
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETLr]], 1
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETLr]], %subreg.sub_8bit
; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; CHECK: %eax = COPY [[AND32ri8_]]
; CHECK: RET 0, implicit %eax
@@ -472,7 +472,7 @@ body: |
; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
; CHECK: [[SETLEr:%[0-9]+]]:gr8 = SETLEr implicit %eflags
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETLEr]], 1
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETLEr]], %subreg.sub_8bit
; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; CHECK: %eax = COPY [[AND32ri8_]]
; CHECK: RET 0, implicit %eax
diff --git a/test/CodeGen/X86/GlobalISel/select-copy.mir b/test/CodeGen/X86/GlobalISel/select-copy.mir
index a72f42782c0..fccba1f8206 100644
--- a/test/CodeGen/X86/GlobalISel/select-copy.mir
+++ b/test/CodeGen/X86/GlobalISel/select-copy.mir
@@ -42,7 +42,7 @@ registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
# ALL: %0:gr8 = COPY %al
-# ALL-NEXT: %2:gr32 = SUBREG_TO_REG 0, %0, 1
+# ALL-NEXT: %2:gr32 = SUBREG_TO_REG 0, %0, %subreg.sub_8bit
# ALL-NEXT: %1:gr32 = AND32ri8 %2, 1, implicit-def %eflags
# ALL-NEXT: %eax = COPY %1
# ALL-NEXT: RET 0, implicit %eax
@@ -146,7 +146,7 @@ regBankSelected: true
registers:
- { id: 0, class: gpr, preferred-register: '' }
# ALL: %0:gr8 = COPY %dl
-# ALL-NEXT: %1:gr32 = SUBREG_TO_REG 0, %0, 1
+# ALL-NEXT: %1:gr32 = SUBREG_TO_REG 0, %0, %subreg.sub_8bit
# ALL-NEXT: %eax = COPY %1
# ALL-NEXT: RET 0, implicit %eax
body: |
@@ -170,7 +170,7 @@ regBankSelected: true
registers:
- { id: 0, class: gpr, preferred-register: '' }
# ALL: %0:gr16 = COPY %dx
-# ALL-NEXT: %1:gr32 = SUBREG_TO_REG 0, %0, 3
+# ALL-NEXT: %1:gr32 = SUBREG_TO_REG 0, %0, %subreg.sub_16bit
# ALL-NEXT: %eax = COPY %1
# ALL-NEXT: RET 0, implicit %eax
body: |
diff --git a/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir b/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir
index 51088e126e5..9df24f65b36 100644
--- a/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir
+++ b/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir
@@ -39,7 +39,7 @@ body: |
; ALL-LABEL: name: test_zext_i1
; ALL: [[COPY:%[0-9]+]]:gr8 = COPY %dil
; ALL: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]]
- ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], 1
+ ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.sub_8bit
; ALL: [[AND64ri8_:%[0-9]+]]:gr64 = AND64ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
; ALL: %rax = COPY [[AND64ri8_]]
; ALL: RET 0, implicit %rax
@@ -112,7 +112,7 @@ body: |
; ALL-LABEL: name: anyext_s64_from_s1
; ALL: [[COPY:%[0-9]+]]:gr64_with_sub_8bit = COPY %rdi
; ALL: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]].sub_8bit
- ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], 1
+ ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.sub_8bit
; ALL: %rax = COPY [[SUBREG_TO_REG]]
; ALL: RET 0, implicit %rax
%0(s64) = COPY %rdi
@@ -137,7 +137,7 @@ body: |
; ALL-LABEL: name: anyext_s64_from_s8
; ALL: [[COPY:%[0-9]+]]:gr64_with_sub_8bit = COPY %rdi
; ALL: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]].sub_8bit
- ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], 1
+ ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.sub_8bit
; ALL: %rax = COPY [[SUBREG_TO_REG]]
; ALL: RET 0, implicit %rax
%0(s64) = COPY %rdi
@@ -162,7 +162,7 @@ body: |
; ALL-LABEL: name: anyext_s64_from_s16
; ALL: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
; ALL: [[COPY1:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit
- ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], 3
+ ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.sub_16bit
; ALL: %rax = COPY [[SUBREG_TO_REG]]
; ALL: RET 0, implicit %rax
%0(s64) = COPY %rdi
@@ -187,7 +187,7 @@ body: |
; ALL-LABEL: name: anyext_s64_from_s32
; ALL: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
; ALL: [[COPY1:%[0-9]+]]:gr32 = COPY [[COPY]].sub_32bit
- ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], 4
+ ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.sub_32bit
; ALL: %rax = COPY [[SUBREG_TO_REG]]
; ALL: RET 0, implicit %rax
%0(s64) = COPY %rdi
diff --git a/test/CodeGen/X86/GlobalISel/select-ext.mir b/test/CodeGen/X86/GlobalISel/select-ext.mir
index 5167ee987a5..90ac0c6763a 100644
--- a/test/CodeGen/X86/GlobalISel/select-ext.mir
+++ b/test/CodeGen/X86/GlobalISel/select-ext.mir
@@ -85,7 +85,7 @@ registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
# ALL: %0:gr8 = COPY %dil
-# ALL-NEXT: %2:gr16 = SUBREG_TO_REG 0, %0, 1
+# ALL-NEXT: %2:gr16 = SUBREG_TO_REG 0, %0, %subreg.sub_8bit
# ALL-NEXT: %1:gr16 = AND16ri8 %2, 1, implicit-def %eflags
# ALL-NEXT: %ax = COPY %1
# ALL-NEXT: RET 0, implicit %ax
@@ -113,7 +113,7 @@ registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
# ALL: %0:gr8 = COPY %dil
-# ALL-NEXT: %2:gr32 = SUBREG_TO_REG 0, %0, 1
+# ALL-NEXT: %2:gr32 = SUBREG_TO_REG 0, %0, %subreg.sub_8bit
# ALL-NEXT: %1:gr32 = AND32ri8 %2, 1, implicit-def %eflags
# ALL-NEXT: %eax = COPY %1
# ALL-NEXT: RET 0, implicit %eax
@@ -288,7 +288,7 @@ registers:
# X32: %0:gr32_abcd = COPY %edi
# X64: %0:gr32 = COPY %edi
# ALL-NEXT: %1:gr8 = COPY %0.sub_8bit
-# ALL-NEXT: %2:gr16 = SUBREG_TO_REG 0, %1, 1
+# ALL-NEXT: %2:gr16 = SUBREG_TO_REG 0, %1, %subreg.sub_8bit
# ALL-NEXT: %ax = COPY %2
# ALL-NEXT: RET 0, implicit %ax
body: |
@@ -323,7 +323,7 @@ registers:
# X32: %0:gr32_abcd = COPY %edi
# X64: %0:gr32 = COPY %edi
# ALL-NEXT: %1:gr8 = COPY %0.sub_8bit
-# ALL-NEXT: %2:gr32 = SUBREG_TO_REG 0, %1, 1
+# ALL-NEXT: %2:gr32 = SUBREG_TO_REG 0, %1, %subreg.sub_8bit
# ALL-NEXT: %eax = COPY %2
# ALL-NEXT: RET 0, implicit %eax
body: |
@@ -358,7 +358,7 @@ registers:
# X32: %0:gr32_abcd = COPY %edi
# X64: %0:gr32 = COPY %edi
# ALL-NEXT: %1:gr8 = COPY %0.sub_8bit
-# ALL-NEXT: %2:gr16 = SUBREG_TO_REG 0, %1, 1
+# ALL-NEXT: %2:gr16 = SUBREG_TO_REG 0, %1, %subreg.sub_8bit
# ALL-NEXT: %ax = COPY %2
# ALL-NEXT: RET 0, implicit %ax
body: |
@@ -422,7 +422,7 @@ registers:
- { id: 2, class: gpr }
# ALL: %0:gr32 = COPY %edi
# ALL-NEXT: %1:gr16 = COPY %0.sub_16bit
-# ALL-NEXT: %2:gr32 = SUBREG_TO_REG 0, %1, 3
+# ALL-NEXT: %2:gr32 = SUBREG_TO_REG 0, %1, %subreg.sub_16bit
# ALL-NEXT: %eax = COPY %2
# ALL-NEXT: RET 0, implicit %eax
body: |
diff --git a/test/CodeGen/X86/GlobalISel/select-intrinsic-x86-flags-read-u32.mir b/test/CodeGen/X86/GlobalISel/select-intrinsic-x86-flags-read-u32.mir
index 596c48b4922..628ab3bac4a 100644
--- a/test/CodeGen/X86/GlobalISel/select-intrinsic-x86-flags-read-u32.mir
+++ b/test/CodeGen/X86/GlobalISel/select-intrinsic-x86-flags-read-u32.mir
@@ -20,7 +20,7 @@ body: |
bb.0:
; CHECK-LABEL: name: read_flags
; CHECK: [[RDFLAGS32_:%[0-9]+]]:gr32 = RDFLAGS32 implicit-def %esp, implicit %esp
- ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[RDFLAGS32_]], 4
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[RDFLAGS32_]], %subreg.sub_32bit
; CHECK: %rax = COPY [[SUBREG_TO_REG]]
%0(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.x86.flags.read.u32)
%rax = COPY %0(s32)
diff --git a/test/CodeGen/X86/O0-pipeline.ll b/test/CodeGen/X86/O0-pipeline.ll
index 1f7415ee2af..8ecafad8022 100644
--- a/test/CodeGen/X86/O0-pipeline.ll
+++ b/test/CodeGen/X86/O0-pipeline.ll
@@ -49,6 +49,7 @@
; CHECK-NEXT: X86 pseudo instruction expansion pass
; CHECK-NEXT: Analyze Machine Code For Garbage Collection
; CHECK-NEXT: X86 vzeroupper inserter
+; CHECK-NEXT: Check CFA info and insert CFI instructions if needed
; CHECK-NEXT: Contiguously Lay Out Funclets
; CHECK-NEXT: StackMap Liveness Analysis
; CHECK-NEXT: Live DEBUG_VALUE analysis
diff --git a/test/CodeGen/X86/TruncAssertZext.ll b/test/CodeGen/X86/TruncAssertZext.ll
index b9ae57ca011..ed98fd51cc0 100644
--- a/test/CodeGen/X86/TruncAssertZext.ll
+++ b/test/CodeGen/X86/TruncAssertZext.ll
@@ -25,6 +25,7 @@ define i64 @main() {
; CHECK-NEXT: subq %rcx, %rax
; CHECK-NEXT: shrq $32, %rax
; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%b = call i64 @foo()
%or = and i64 %b, 18446744069414584575 ; this is 0xffffffff000000ff
diff --git a/test/CodeGen/X86/avg.ll b/test/CodeGen/X86/avg.ll
index 508f10e9889..14494779f10 100644
--- a/test/CodeGen/X86/avg.ll
+++ b/test/CodeGen/X86/avg.ll
@@ -2209,62 +2209,53 @@ define void @avg_v16i8_const(<16 x i8>* %a) nounwind {
define void @avg_v32i8_const(<32 x i8>* %a) nounwind {
; SSE2-LABEL: avg_v32i8_const:
; SSE2: # BB#0:
-; SSE2-NEXT: movdqa (%rdi), %xmm5
-; SSE2-NEXT: movdqa 16(%rdi), %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; SSE2-NEXT: movdqa %xmm2, %xmm8
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
-; SSE2-NEXT: movdqa %xmm6, %xmm4
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
-; SSE2-NEXT: movdqa %xmm5, %xmm7
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [5,6,7,8]
-; SSE2-NEXT: paddd %xmm9, %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,3,4]
-; SSE2-NEXT: paddd %xmm3, %xmm7
-; SSE2-NEXT: paddd %xmm9, %xmm6
-; SSE2-NEXT: paddd %xmm3, %xmm4
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: movdqa 16(%rdi), %xmm3
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
+; SSE2-NEXT: movdqa %xmm0, %xmm8
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [1,2,3,4]
+; SSE2-NEXT: paddd %xmm9, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,6,7,8]
+; SSE2-NEXT: paddd %xmm4, %xmm8
; SSE2-NEXT: paddd %xmm9, %xmm2
-; SSE2-NEXT: paddd %xmm3, %xmm8
+; SSE2-NEXT: paddd %xmm4, %xmm5
+; SSE2-NEXT: paddd %xmm9, %xmm3
+; SSE2-NEXT: paddd %xmm4, %xmm6
; SSE2-NEXT: paddd %xmm9, %xmm1
-; SSE2-NEXT: paddd %xmm3, %xmm0
-; SSE2-NEXT: psrld $1, %xmm0
+; SSE2-NEXT: paddd %xmm4, %xmm7
+; SSE2-NEXT: psrld $1, %xmm7
; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: psrld $1, %xmm8
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: psrld $1, %xmm4
+; SSE2-NEXT: packuswb %xmm7, %xmm1
; SSE2-NEXT: psrld $1, %xmm6
-; SSE2-NEXT: psrld $1, %xmm7
+; SSE2-NEXT: psrld $1, %xmm3
+; SSE2-NEXT: packuswb %xmm6, %xmm3
+; SSE2-NEXT: packuswb %xmm3, %xmm1
; SSE2-NEXT: psrld $1, %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pand %xmm3, %xmm5
-; SSE2-NEXT: pand %xmm3, %xmm7
-; SSE2-NEXT: packuswb %xmm5, %xmm7
-; SSE2-NEXT: pand %xmm3, %xmm6
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: packuswb %xmm6, %xmm4
-; SSE2-NEXT: packuswb %xmm7, %xmm4
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pand %xmm3, %xmm8
-; SSE2-NEXT: packuswb %xmm2, %xmm8
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: psrld $1, %xmm2
+; SSE2-NEXT: packuswb %xmm5, %xmm2
+; SSE2-NEXT: psrld $1, %xmm8
+; SSE2-NEXT: psrld $1, %xmm0
; SSE2-NEXT: packuswb %xmm8, %xmm0
-; SSE2-NEXT: movdqu %xmm0, (%rax)
-; SSE2-NEXT: movdqu %xmm4, (%rax)
+; SSE2-NEXT: packuswb %xmm0, %xmm2
+; SSE2-NEXT: movdqu %xmm1, (%rax)
+; SSE2-NEXT: movdqu %xmm2, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v32i8_const:
@@ -2277,9 +2268,9 @@ define void @avg_v32i8_const(<32 x i8>* %a) nounwind {
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [5,6,7,8]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [1,2,3,4]
; AVX1-NEXT: vpaddd %xmm0, %xmm7, %xmm9
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,2,3,4]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [5,6,7,8]
; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpaddd %xmm0, %xmm5, %xmm5
; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm4
@@ -2287,30 +2278,21 @@ define void @avg_v32i8_const(<32 x i8>* %a) nounwind {
; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm1, %xmm8
+; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
-; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
-; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
-; AVX1-NEXT: vpsrld $1, %xmm5, %xmm5
-; AVX1-NEXT: vpsrld $1, %xmm6, %xmm6
-; AVX1-NEXT: vpsrld $1, %xmm9, %xmm7
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT: vpand %xmm1, %xmm7, %xmm7
-; AVX1-NEXT: vpand %xmm1, %xmm6, %xmm6
-; AVX1-NEXT: vpackuswb %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm5
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpackuswb %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm3
-; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm8, %xmm1
-; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $1, %xmm2, %xmm1
+; AVX1-NEXT: vpsrld $1, %xmm3, %xmm2
+; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $1, %xmm4, %xmm1
+; AVX1-NEXT: vpsrld $1, %xmm5, %xmm2
+; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpsrld $1, %xmm6, %xmm2
+; AVX1-NEXT: vpsrld $1, %xmm9, %xmm3
+; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -2567,49 +2549,40 @@ define void @avg_v64i8_const(<64 x i8>* %a) nounwind {
; AVX2-NEXT: vpaddd %ymm8, %ymm6, %ymm6
; AVX2-NEXT: vpaddd %ymm8, %ymm5, %ymm5
; AVX2-NEXT: vpaddd %ymm8, %ymm4, %ymm4
-; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm9
+; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm3
; AVX2-NEXT: vpaddd %ymm8, %ymm2, %ymm2
; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm0
-; AVX2-NEXT: vpsrld $1, %ymm0, %ymm10
+; AVX2-NEXT: vpsrld $1, %ymm0, %ymm8
; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
-; AVX2-NEXT: vpsrld $1, %ymm2, %ymm3
-; AVX2-NEXT: vpsrld $1, %ymm9, %ymm8
+; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2
+; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3
; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4
; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5
; AVX2-NEXT: vpsrld $1, %ymm6, %ymm6
-; AVX2-NEXT: vpsrld $1, %ymm7, %ymm2
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm7
-; AVX2-NEXT: vpackssdw %xmm7, %xmm2, %xmm7
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm7
-; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm0
-; AVX2-NEXT: vpackssdw %xmm0, %xmm6, %xmm0
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0]
+; AVX2-NEXT: vpsrld $1, %ymm7, %ymm7
+; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm0
+; AVX2-NEXT: vpackssdw %xmm0, %xmm7, %xmm0
+; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7
+; AVX2-NEXT: vpackssdw %xmm7, %xmm6, %xmm6
+; AVX2-NEXT: vpackuswb %xmm0, %xmm6, %xmm0
; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX2-NEXT: vpackssdw %xmm6, %xmm5, %xmm5
-; AVX2-NEXT: vpshufb %xmm2, %xmm5, %xmm5
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6
; AVX2-NEXT: vpackssdw %xmm6, %xmm4, %xmm4
-; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm4
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; AVX2-NEXT: vpackuswb %xmm5, %xmm4, %xmm4
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm4
-; AVX2-NEXT: vpackssdw %xmm4, %xmm8, %xmm4
-; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm4
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX2-NEXT: vpackssdw %xmm5, %xmm3, %xmm3
-; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm3
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
-; AVX2-NEXT: vpackssdw %xmm4, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm4
-; AVX2-NEXT: vpackssdw %xmm4, %xmm10, %xmm4
-; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm2
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX2-NEXT: vpackssdw %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpackssdw %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm3
+; AVX2-NEXT: vpackssdw %xmm3, %xmm8, %xmm3
+; AVX2-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX2-NEXT: vmovdqu %ymm1, (%rax)
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/avx-basic.ll b/test/CodeGen/X86/avx-basic.ll
index 923e1b9b0e0..dc386415934 100644
--- a/test/CodeGen/X86/avx-basic.ll
+++ b/test/CodeGen/X86/avx-basic.ll
@@ -12,7 +12,6 @@ define void @zero128() nounwind ssp {
; CHECK-NEXT: movq _z@{{.*}}(%rip), %rax
; CHECK-NEXT: vmovaps %xmm0, (%rax)
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
store <4 x float> zeroinitializer, <4 x float>* @z, align 16
ret void
}
@@ -27,7 +26,6 @@ define void @zero256() nounwind ssp {
; CHECK-NEXT: vmovaps %ymm0, (%rax)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
store <8 x float> zeroinitializer, <8 x float>* @x, align 32
store <4 x double> zeroinitializer, <4 x double>* @y, align 32
ret void
@@ -41,7 +39,6 @@ define void @ones([0 x float]* nocapture %RET, [0 x float]* nocapture %aFOO) nou
; CHECK-NEXT: vmovaps %ymm0, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
allocas:
%ptr2vec615 = bitcast [0 x float]* %RET to <8 x float>*
store <8 x float> <float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000, float
@@ -59,7 +56,6 @@ define void @ones2([0 x i32]* nocapture %RET, [0 x i32]* nocapture %aFOO) nounwi
; CHECK-NEXT: vmovaps %ymm0, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
allocas:
%ptr2vec615 = bitcast [0 x i32]* %RET to <8 x i32>*
store <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32>* %ptr2vec615, align 32
@@ -83,7 +79,6 @@ define <8 x i32> @VMOVZQI2PQI([0 x float]* nocapture %aFOO) nounwind {
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%ptrcast.i33.i = bitcast [0 x float]* %aFOO to i32*
%val.i34.i = load i32, i32* %ptrcast.i33.i, align 4
%ptroffset.i22.i992 = getelementptr [0 x float], [0 x float]* %aFOO, i64 0, i64 1
@@ -102,7 +97,6 @@ define <16 x float> @fneg(<16 x float> %a) nounwind {
; CHECK-NEXT: vxorps %ymm2, %ymm0, %ymm0
; CHECK-NEXT: vxorps %ymm2, %ymm1, %ymm1
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
ret <16 x float> %1
}
@@ -114,7 +108,6 @@ define <16 x i16> @build_vec_16x16(i16 %a) nounwind readonly {
; CHECK-NEXT: movzwl %di, %eax
; CHECK-NEXT: vmovd %eax, %xmm0
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%res = insertelement <16 x i16> <i16 undef, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, i16 %a, i32 0
ret <16 x i16> %res
}
diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll
index 44eb14160ee..e508e345de6 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -581,15 +581,10 @@ declare i32 @llvm.x86.avx.ptestz.256(<4 x i64>, <4 x i64>) nounwind readnone
define <8 x float> @test_x86_avx_rcp_ps_256(<8 x float> %a0) {
-; AVX-LABEL: test_x86_avx_rcp_ps_256:
-; AVX: # BB#0:
-; AVX-NEXT: vrcpps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x53,0xc0]
-; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_rcp_ps_256:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vrcp14ps %ymm0, %ymm0 # encoding: [0x62,0xf2,0x7d,0x28,0x4c,0xc0]
-; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_rcp_ps_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vrcpps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x53,0xc0]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -619,15 +614,10 @@ declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readno
define <8 x float> @test_x86_avx_rsqrt_ps_256(<8 x float> %a0) {
-; AVX-LABEL: test_x86_avx_rsqrt_ps_256:
-; AVX: # BB#0:
-; AVX-NEXT: vrsqrtps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x52,0xc0]
-; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_rsqrt_ps_256:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vrsqrt14ps %ymm0, %ymm0 # encoding: [0x62,0xf2,0x7d,0x28,0x4e,0xc0]
-; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_rsqrt_ps_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vrsqrtps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x52,0xc0]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -635,10 +625,15 @@ declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
define <4 x double> @test_x86_avx_sqrt_pd_256(<4 x double> %a0) {
-; CHECK-LABEL: test_x86_avx_sqrt_pd_256:
-; CHECK: # BB#0:
-; CHECK-NEXT: vsqrtpd %ymm0, %ymm0 # encoding: [0xc5,0xfd,0x51,0xc0]
-; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+; AVX-LABEL: test_x86_avx_sqrt_pd_256:
+; AVX: # BB#0:
+; AVX-NEXT: vsqrtpd %ymm0, %ymm0 # encoding: [0xc5,0xfd,0x51,0xc0]
+; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx_sqrt_pd_256:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vsqrtpd %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x51,0xc0]
+; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -646,10 +641,15 @@ declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
define <8 x float> @test_x86_avx_sqrt_ps_256(<8 x float> %a0) {
-; CHECK-LABEL: test_x86_avx_sqrt_ps_256:
-; CHECK: # BB#0:
-; CHECK-NEXT: vsqrtps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x51,0xc0]
-; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+; AVX-LABEL: test_x86_avx_sqrt_ps_256:
+; AVX: # BB#0:
+; AVX-NEXT: vsqrtps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x51,0xc0]
+; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx_sqrt_ps_256:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vsqrtps %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x51,0xc0]
+; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
diff --git a/test/CodeGen/X86/avx-schedule.ll b/test/CodeGen/X86/avx-schedule.ll
index 44d13db65c9..858a27b1d48 100644
--- a/test/CodeGen/X86/avx-schedule.ll
+++ b/test/CodeGen/X86/avx-schedule.ll
@@ -3982,8 +3982,8 @@ define <8 x float> @test_rcpps(<8 x float> %a0, <8 x float> *%a1) {
;
; SKX-LABEL: test_rcpps:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %ymm0, %ymm0 # sched: [4:1.00]
-; SKX-NEXT: vrcp14ps (%rdi), %ymm1 # sched: [11:1.00]
+; SKX-NEXT: vrcpps %ymm0, %ymm0 # sched: [4:1.00]
+; SKX-NEXT: vrcpps (%rdi), %ymm1 # sched: [11:1.00]
; SKX-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
;
@@ -4174,8 +4174,8 @@ define <8 x float> @test_rsqrtps(<8 x float> %a0, <8 x float> *%a1) {
;
; SKX-LABEL: test_rsqrtps:
; SKX: # BB#0:
-; SKX-NEXT: vrsqrt14ps %ymm0, %ymm0 # sched: [4:1.00]
-; SKX-NEXT: vrsqrt14ps (%rdi), %ymm1 # sched: [11:1.00]
+; SKX-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [4:1.00]
+; SKX-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [11:1.00]
; SKX-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
;
diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll
index b75bd8cc3ee..909e8398680 100644
--- a/test/CodeGen/X86/avx512-mask-op.ll
+++ b/test/CodeGen/X86/avx512-mask-op.ll
@@ -699,11 +699,13 @@ define <16 x i8> @test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) {
; AVX512BW-NEXT: jg LBB17_1
; AVX512BW-NEXT: ## BB#2:
; AVX512BW-NEXT: vpcmpltud %zmm2, %zmm1, %k0
-; AVX512BW-NEXT: jmp LBB17_3
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
+; AVX512BW-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
; AVX512BW-NEXT: LBB17_1:
-; AVX512BW-NEXT: vpcmpgtd %zmm2, %zmm0, %k0
-; AVX512BW-NEXT: LBB17_3:
-; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
+; AVX512BW-NEXT: vpcmpgtd %zmm2, %zmm0, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
diff --git a/test/CodeGen/X86/avx512-regcall-Mask.ll b/test/CodeGen/X86/avx512-regcall-Mask.ll
index bb541f46567..fa6adec675f 100644
--- a/test/CodeGen/X86/avx512-regcall-Mask.ll
+++ b/test/CodeGen/X86/avx512-regcall-Mask.ll
@@ -209,12 +209,18 @@ define i64 @caller_argv64i1() #0 {
; LINUXOSX64-NEXT: pushq %rax
; LINUXOSX64-NEXT: .cfi_adjust_cfa_offset 8
; LINUXOSX64-NEXT: callq test_argv64i1
-; LINUXOSX64-NEXT: addq $24, %rsp
+; LINUXOSX64-NEXT: addq $16, %rsp
; LINUXOSX64-NEXT: .cfi_adjust_cfa_offset -16
+; LINUXOSX64-NEXT: addq $8, %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 40
; LINUXOSX64-NEXT: popq %r12
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 32
; LINUXOSX64-NEXT: popq %r13
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 24
; LINUXOSX64-NEXT: popq %r14
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
; LINUXOSX64-NEXT: popq %r15
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
entry:
%v0 = bitcast i64 4294967298 to <64 x i1>
@@ -287,6 +293,7 @@ define <64 x i1> @caller_retv64i1() #0 {
; LINUXOSX64-NEXT: kmovq %rax, %k0
; LINUXOSX64-NEXT: vpmovm2b %k0, %zmm0
; LINUXOSX64-NEXT: popq %rax
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
entry:
%call = call x86_regcallcc <64 x i1> @test_retv64i1()
@@ -397,7 +404,9 @@ define x86_regcallcc i32 @test_argv32i1(<32 x i1> %x0, <32 x i1> %x1, <32 x i1>
; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm14 # 16-byte Reload
; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm15 # 16-byte Reload
; LINUXOSX64-NEXT: addq $128, %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: vzeroupper
; LINUXOSX64-NEXT: retq
entry:
@@ -451,6 +460,7 @@ define i32 @caller_argv32i1() #0 {
; LINUXOSX64-NEXT: movl $1, %edx
; LINUXOSX64-NEXT: callq test_argv32i1
; LINUXOSX64-NEXT: popq %rcx
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
entry:
%v0 = bitcast i32 1 to <32 x i1>
@@ -513,6 +523,7 @@ define i32 @caller_retv32i1() #0 {
; LINUXOSX64-NEXT: callq test_retv32i1
; LINUXOSX64-NEXT: incl %eax
; LINUXOSX64-NEXT: popq %rcx
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
entry:
%call = call x86_regcallcc <32 x i1> @test_retv32i1()
@@ -626,7 +637,9 @@ define x86_regcallcc i16 @test_argv16i1(<16 x i1> %x0, <16 x i1> %x1, <16 x i1>
; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm14 # 16-byte Reload
; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm15 # 16-byte Reload
; LINUXOSX64-NEXT: addq $128, %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%res = call i16 @test_argv16i1helper(<16 x i1> %x0, <16 x i1> %x1, <16 x i1> %x2)
ret i16 %res
@@ -678,6 +691,7 @@ define i16 @caller_argv16i1() #0 {
; LINUXOSX64-NEXT: movl $1, %edx
; LINUXOSX64-NEXT: callq test_argv16i1
; LINUXOSX64-NEXT: popq %rcx
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
entry:
%v0 = bitcast i16 1 to <16 x i1>
@@ -746,6 +760,7 @@ define i16 @caller_retv16i1() #0 {
; LINUXOSX64-NEXT: incl %eax
; LINUXOSX64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; LINUXOSX64-NEXT: popq %rcx
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
entry:
%call = call x86_regcallcc <16 x i1> @test_retv16i1()
@@ -859,7 +874,9 @@ define x86_regcallcc i8 @test_argv8i1(<8 x i1> %x0, <8 x i1> %x1, <8 x i1> %x2)
; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm14 # 16-byte Reload
; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm15 # 16-byte Reload
; LINUXOSX64-NEXT: addq $128, %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%res = call i8 @test_argv8i1helper(<8 x i1> %x0, <8 x i1> %x1, <8 x i1> %x2)
ret i8 %res
@@ -911,6 +928,7 @@ define i8 @caller_argv8i1() #0 {
; LINUXOSX64-NEXT: movl $1, %edx
; LINUXOSX64-NEXT: callq test_argv8i1
; LINUXOSX64-NEXT: popq %rcx
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
entry:
%v0 = bitcast i8 1 to <8 x i1>
@@ -984,9 +1002,11 @@ define <8 x i1> @caller_retv8i1() #0 {
; LINUXOSX64-NEXT: vpmovm2w %k0, %zmm0
; LINUXOSX64-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; LINUXOSX64-NEXT: popq %rax
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: vzeroupper
; LINUXOSX64-NEXT: retq
entry:
%call = call x86_regcallcc <8 x i1> @test_retv8i1()
ret <8 x i1> %call
}
+
diff --git a/test/CodeGen/X86/avx512-regcall-NoMask.ll b/test/CodeGen/X86/avx512-regcall-NoMask.ll
index 43a1871245b..b4f1d2c776d 100644
--- a/test/CodeGen/X86/avx512-regcall-NoMask.ll
+++ b/test/CodeGen/X86/avx512-regcall-NoMask.ll
@@ -63,6 +63,7 @@ define x86_regcallcc i1 @test_CallargReti1(i1 %a) {
; LINUXOSX64-NEXT: callq test_argReti1
; LINUXOSX64-NEXT: incb %al
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%b = add i1 %a, 1
%c = call x86_regcallcc i1 @test_argReti1(i1 %b)
@@ -130,6 +131,7 @@ define x86_regcallcc i8 @test_CallargReti8(i8 %a) {
; LINUXOSX64-NEXT: callq test_argReti8
; LINUXOSX64-NEXT: incb %al
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%b = add i8 %a, 1
%c = call x86_regcallcc i8 @test_argReti8(i8 %b)
@@ -200,6 +202,7 @@ define x86_regcallcc i16 @test_CallargReti16(i16 %a) {
; LINUXOSX64-NEXT: incl %eax
; LINUXOSX64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%b = add i16 %a, 1
%c = call x86_regcallcc i16 @test_argReti16(i16 %b)
@@ -261,6 +264,7 @@ define x86_regcallcc i32 @test_CallargReti32(i32 %a) {
; LINUXOSX64-NEXT: callq test_argReti32
; LINUXOSX64-NEXT: incl %eax
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%b = add i32 %a, 1
%c = call x86_regcallcc i32 @test_argReti32(i32 %b)
@@ -327,6 +331,7 @@ define x86_regcallcc i64 @test_CallargReti64(i64 %a) {
; LINUXOSX64-NEXT: callq test_argReti64
; LINUXOSX64-NEXT: incq %rax
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%b = add i64 %a, 1
%c = call x86_regcallcc i64 @test_argReti64(i64 %b)
@@ -406,7 +411,9 @@ define x86_regcallcc float @test_CallargRetFloat(float %a) {
; LINUXOSX64-NEXT: vaddss %xmm8, %xmm0, %xmm0
; LINUXOSX64-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload
; LINUXOSX64-NEXT: addq $16, %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%b = fadd float 1.0, %a
%c = call x86_regcallcc float @test_argRetFloat(float %b)
@@ -486,7 +493,9 @@ define x86_regcallcc double @test_CallargRetDouble(double %a) {
; LINUXOSX64-NEXT: vaddsd %xmm8, %xmm0, %xmm0
; LINUXOSX64-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload
; LINUXOSX64-NEXT: addq $16, %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%b = fadd double 1.0, %a
%c = call x86_regcallcc double @test_argRetDouble(double %b)
@@ -548,6 +557,7 @@ define x86_regcallcc x86_fp80 @test_CallargRetf80(x86_fp80 %a) {
; LINUXOSX64-NEXT: callq test_argRetf80
; LINUXOSX64-NEXT: fadd %st(0), %st(0)
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%b = fadd x86_fp80 %a, %a
%c = call x86_regcallcc x86_fp80 @test_argRetf80(x86_fp80 %b)
@@ -611,6 +621,7 @@ define x86_regcallcc [4 x i32]* @test_CallargRetPointer([4 x i32]* %a) {
; LINUXOSX64-NEXT: callq test_argRetPointer
; LINUXOSX64-NEXT: incl %eax
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%b = ptrtoint [4 x i32]* %a to i32
%c = add i32 %b, 1
@@ -694,7 +705,9 @@ define x86_regcallcc <4 x i32> @test_CallargRet128Vector(<4 x i32> %a) {
; LINUXOSX64-NEXT: vmovdqa32 %xmm8, %xmm0 {%k1}
; LINUXOSX64-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload
; LINUXOSX64-NEXT: addq $16, %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%b = call x86_regcallcc <4 x i32> @test_argRet128Vector(<4 x i32> %a, <4 x i32> %a)
%c = select <4 x i1> undef , <4 x i32> %a, <4 x i32> %b
@@ -768,7 +781,9 @@ define x86_regcallcc <8 x i32> @test_CallargRet256Vector(<8 x i32> %a) {
; LINUXOSX64-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload
; LINUXOSX64-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1}
; LINUXOSX64-NEXT: addq $48, %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%b = call x86_regcallcc <8 x i32> @test_argRet256Vector(<8 x i32> %a, <8 x i32> %a)
%c = select <8 x i1> undef , <8 x i32> %a, <8 x i32> %b
@@ -842,7 +857,9 @@ define x86_regcallcc <16 x i32> @test_CallargRet512Vector(<16 x i32> %a) {
; LINUXOSX64-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
; LINUXOSX64-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; LINUXOSX64-NEXT: addq $112, %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 8
; LINUXOSX64-NEXT: retq
%b = call x86_regcallcc <16 x i32> @test_argRet512Vector(<16 x i32> %a, <16 x i32> %a)
%c = select <16 x i1> undef , <16 x i32> %a, <16 x i32> %b
diff --git a/test/CodeGen/X86/avx512-schedule.ll b/test/CodeGen/X86/avx512-schedule.ll
index 8372fbdb9ab..abc8c1a7513 100755
--- a/test/CodeGen/X86/avx512-schedule.ll
+++ b/test/CodeGen/X86/avx512-schedule.ll
@@ -8839,6 +8839,7 @@ define <16 x float> @broadcast_ss_spill(float %x) {
; GENERIC-NEXT: callq func_f32
; GENERIC-NEXT: vbroadcastss (%rsp), %zmm0 # 16-byte Folded Reload
; GENERIC-NEXT: addq $24, %rsp # sched: [1:0.33]
+; GENERIC-NEXT: .cfi_def_cfa_offset 8
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: broadcast_ss_spill:
@@ -8852,6 +8853,7 @@ define <16 x float> @broadcast_ss_spill(float %x) {
; SKX-NEXT: vbroadcastss (%rsp), %zmm0 # 16-byte Folded Reload sched: [8:0.50]
; SKX-NEXT: # sched: [8:0.50]
; SKX-NEXT: addq $24, %rsp # sched: [1:0.25]
+; SKX-NEXT: .cfi_def_cfa_offset 8
; SKX-NEXT: retq # sched: [7:1.00]
%a = fadd float %x, %x
call void @func_f32(float %a)
@@ -8872,6 +8874,7 @@ define <8 x double> @broadcast_sd_spill(double %x) {
; GENERIC-NEXT: callq func_f64
; GENERIC-NEXT: vbroadcastsd (%rsp), %zmm0 # 16-byte Folded Reload
; GENERIC-NEXT: addq $24, %rsp # sched: [1:0.33]
+; GENERIC-NEXT: .cfi_def_cfa_offset 8
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: broadcast_sd_spill:
@@ -8885,6 +8888,7 @@ define <8 x double> @broadcast_sd_spill(double %x) {
; SKX-NEXT: vbroadcastsd (%rsp), %zmm0 # 16-byte Folded Reload sched: [8:0.50]
; SKX-NEXT: # sched: [8:0.50]
; SKX-NEXT: addq $24, %rsp # sched: [1:0.25]
+; SKX-NEXT: .cfi_def_cfa_offset 8
; SKX-NEXT: retq # sched: [7:1.00]
%a = fadd double %x, %x
call void @func_f64(double %a)
diff --git a/test/CodeGen/X86/avx512-select.ll b/test/CodeGen/X86/avx512-select.ll
index 43cf9ee7358..51a7c685ed4 100644
--- a/test/CodeGen/X86/avx512-select.ll
+++ b/test/CodeGen/X86/avx512-select.ll
@@ -115,6 +115,7 @@ define <16 x double> @select04(<16 x double> %a, <16 x double> %b) {
; X86-NEXT: vmovaps 8(%ebp), %zmm1
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
+; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: retl
;
; X64-LABEL: select04:
diff --git a/test/CodeGen/X86/avx512-shuffle-schedule.ll b/test/CodeGen/X86/avx512-shuffle-schedule.ll
index c59fb5b97bc..c95f0d40fbf 100755
--- a/test/CodeGen/X86/avx512-shuffle-schedule.ll
+++ b/test/CodeGen/X86/avx512-shuffle-schedule.ll
@@ -9533,18 +9533,18 @@ define <8 x float> @test2_8xfloat_shuff_mask0(<8 x float> %vec1, <8 x float> %ve
define <8 x float> @test2_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_8xfloat_masked_shuff_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
+; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -9555,18 +9555,16 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x flo
define <8 x float> @test2_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -9576,18 +9574,18 @@ define <8 x float> @test2_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8
define <8 x float> @test2_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_8xfloat_masked_shuff_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
+; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -9598,18 +9596,16 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x flo
define <8 x float> @test2_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -9619,18 +9615,18 @@ define <8 x float> @test2_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8
define <8 x float> @test2_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7]
+; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_8xfloat_masked_shuff_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
+; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -9641,18 +9637,16 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x flo
define <8 x float> @test2_8xfloat_zero_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -9675,18 +9669,18 @@ define <8 x float> @test2_8xfloat_shuff_mask3(<8 x float> %vec1, <8 x float> %ve
define <8 x float> @test2_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test2_8xfloat_masked_shuff_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
+; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -9697,18 +9691,16 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x flo
define <8 x float> @test_8xfloat_zero_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_shuff_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -9732,18 +9724,18 @@ define <8 x float> @test_8xfloat_shuff_mem_mask0(<8 x float> %vec1, <8 x float>*
define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7]
+; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -9755,18 +9747,16 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -9778,18 +9768,18 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1,
define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7]
+; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -9801,18 +9791,16 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -9824,18 +9812,18 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1,
define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -9847,18 +9835,16 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -9884,18 +9870,18 @@ define <8 x float> @test_8xfloat_shuff_mem_mask3(<8 x float> %vec1, <8 x float>*
define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -9907,18 +9893,16 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -10337,18 +10321,18 @@ define <4 x double> @test_4xdouble_shuff_mask0(<4 x double> %vec1, <4 x double>
define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
+; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10359,18 +10343,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x d
define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10380,18 +10362,18 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <
define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
+; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10402,18 +10384,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x d
define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10423,18 +10403,18 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <
define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
+; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10445,18 +10425,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x d
define <4 x double> @test_4xdouble_zero_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10479,18 +10457,18 @@ define <4 x double> @test_4xdouble_shuff_mask3(<4 x double> %vec1, <4 x double>
define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
+; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10501,18 +10479,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x d
define <4 x double> @test_4xdouble_zero_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -10536,18 +10512,18 @@ define <4 x double> @test_4xdouble_shuff_mem_mask0(<4 x double> %vec1, <4 x doub
define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
+; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -10559,18 +10535,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4
define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -10582,18 +10556,18 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec
define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
+; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -10605,18 +10579,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4
define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -10628,18 +10600,18 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec
define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
+; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -10651,18 +10623,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4
define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -10688,18 +10658,18 @@ define <4 x double> @test_4xdouble_shuff_mem_mask3(<4 x double> %vec1, <4 x doub
define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
+; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -10711,18 +10681,16 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4
define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -11128,12 +11096,12 @@ define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask3(<8 x double> %vec
define <8 x i32> @test_8xi32_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) {
; GENERIC-LABEL: test_8xi32_shuff_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
+; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_shuff_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
ret <8 x i32> %res
@@ -11141,18 +11109,18 @@ define <8 x i32> @test_8xi32_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) {
define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7]
+; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11163,18 +11131,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2
define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11184,18 +11150,18 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32>
define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11206,18 +11172,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2
define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11227,18 +11191,18 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32>
define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7]
+; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11249,18 +11213,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2
define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11270,12 +11232,12 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32>
define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) {
; GENERIC-LABEL: test_8xi32_shuff_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_shuff_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
ret <8 x i32> %res
@@ -11283,18 +11245,18 @@ define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) {
define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11305,18 +11267,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2
define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -11326,12 +11286,12 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32>
define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p) {
; GENERIC-LABEL: test_8xi32_shuff_mem_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
+; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_shuff_mem_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -11340,18 +11300,18 @@ define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p)
define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mem_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -11363,18 +11323,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>*
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -11386,18 +11344,18 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i
define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mem_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -11409,18 +11367,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>*
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -11432,18 +11388,18 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i
define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mem_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -11455,18 +11411,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>*
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -11478,12 +11432,12 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i
define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p) {
; GENERIC-LABEL: test_8xi32_shuff_mem_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
+; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_shuff_mem_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -11492,18 +11446,18 @@ define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p)
define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_masked_shuff_mem_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -11515,18 +11469,16 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>*
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -11932,12 +11884,12 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask3(<16 x i32> %vec1, <16
define <4 x i64> @test_4xi64_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) {
; GENERIC-LABEL: test_4xi64_shuff_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_shuff_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
ret <4 x i64> %res
@@ -11945,18 +11897,18 @@ define <4 x i64> @test_4xi64_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) {
define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
+; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -11967,18 +11919,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2
define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -11988,18 +11938,18 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64>
define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
+; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -12010,18 +11960,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2
define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -12031,18 +11979,18 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64>
define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
+; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -12053,18 +12001,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2
define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -12074,12 +12020,12 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64>
define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) {
; GENERIC-LABEL: test_4xi64_shuff_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
+; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_shuff_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
ret <4 x i64> %res
@@ -12087,18 +12033,18 @@ define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) {
define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; GENERIC-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
+; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -12109,18 +12055,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2
define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -12130,12 +12074,12 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64>
define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p) {
; GENERIC-LABEL: test_4xi64_shuff_mem_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
+; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_shuff_mem_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -12144,18 +12088,18 @@ define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p)
define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mem_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -12167,18 +12111,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>*
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -12190,18 +12132,18 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i
define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mem_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -12213,18 +12155,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>*
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -12236,18 +12176,18 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i
define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mem_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -12259,18 +12199,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>*
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -12282,12 +12220,12 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i
define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p) {
; GENERIC-LABEL: test_4xi64_shuff_mem_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
+; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_shuff_mem_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -12296,18 +12234,18 @@ define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p)
define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; GENERIC-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_masked_shuff_mem_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -12319,18 +12257,16 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>*
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
; GENERIC: # BB#0:
-; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
; SKX: # BB#0:
-; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
diff --git a/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll b/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll
index c957a85a885..799bbc11bee 100644
--- a/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll
+++ b/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll
@@ -14,10 +14,10 @@ define <8 x float> @test_8xfloat_shuff_mask0(<8 x float> %vec1, <8 x float> %vec
define <8 x float> @test_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_masked_shuff_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqps %ymm1, %ymm3, %k1
-; CHECK-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -28,10 +28,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x floa
define <8 x float> @test_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqps %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -41,10 +40,10 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_masked_shuff_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqps %ymm1, %ymm3, %k1
-; CHECK-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -55,10 +54,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x floa
define <8 x float> @test_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqps %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -68,10 +66,10 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_masked_shuff_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqps %ymm1, %ymm3, %k1
-; CHECK-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -82,10 +80,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x floa
define <8 x float> @test_8xfloat_zero_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqps %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -103,10 +100,10 @@ define <8 x float> @test_8xfloat_shuff_mask3(<8 x float> %vec1, <8 x float> %vec
define <8 x float> @test_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_masked_shuff_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqps %ymm1, %ymm3, %k1
-; CHECK-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -117,10 +114,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x floa
define <8 x float> @test_8xfloat_zero_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqps %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -139,10 +135,10 @@ define <8 x float> @test_8xfloat_shuff_mem_mask0(<8 x float> %vec1, <8 x float>*
define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
-; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -154,10 +150,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7]
; CHECK-NEXT: retq
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -169,10 +164,10 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1,
define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
-; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -184,10 +179,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7]
; CHECK-NEXT: retq
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -199,10 +193,10 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1,
define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
-; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -214,10 +208,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
; CHECK-NEXT: retq
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -238,10 +231,10 @@ define <8 x float> @test_8xfloat_shuff_mem_mask3(<8 x float> %vec1, <8 x float>*
define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
-; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -253,10 +246,9 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x
define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
; CHECK-NEXT: retq
%vec2 = load <8 x float>, <8 x float>* %vec2p
%shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -530,10 +522,10 @@ define <4 x double> @test_4xdouble_shuff_mask0(<4 x double> %vec1, <4 x double>
define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_masked_shuff_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqpd %ymm1, %ymm3, %k1
-; CHECK-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT: vmovapd %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -544,10 +536,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x d
define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqpd %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -557,10 +548,10 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <
define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_masked_shuff_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqpd %ymm1, %ymm3, %k1
-; CHECK-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT: vmovapd %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -571,10 +562,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x d
define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqpd %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -584,10 +574,10 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <
define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_masked_shuff_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqpd %ymm1, %ymm3, %k1
-; CHECK-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT: vmovapd %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -598,10 +588,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x d
define <4 x double> @test_4xdouble_zero_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqpd %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -619,10 +608,10 @@ define <4 x double> @test_4xdouble_shuff_mask3(<4 x double> %vec1, <4 x double>
define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_masked_shuff_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqpd %ymm1, %ymm3, %k1
-; CHECK-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT: vmovapd %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -633,10 +622,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x d
define <4 x double> @test_4xdouble_zero_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqpd %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -655,10 +643,10 @@ define <4 x double> @test_4xdouble_shuff_mem_mask0(<4 x double> %vec1, <4 x doub
define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
-; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -670,10 +658,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4
define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
; CHECK-NEXT: retq
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -685,10 +672,10 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec
define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
-; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -700,10 +687,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4
define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
; CHECK-NEXT: retq
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -715,10 +701,10 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec
define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
-; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -730,10 +716,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4
define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
; CHECK-NEXT: retq
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -754,10 +739,10 @@ define <4 x double> @test_4xdouble_shuff_mem_mask3(<4 x double> %vec1, <4 x doub
define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
-; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -769,10 +754,9 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4
define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
; CHECK-NEXT: retq
%vec2 = load <4 x double>, <4 x double>* %vec2p
%shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -1038,7 +1022,7 @@ define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask3(<8 x double> %vec
define <8 x i32> @test_8xi32_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) {
; CHECK-LABEL: test_8xi32_shuff_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: retq
%res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
ret <8 x i32> %res
@@ -1046,10 +1030,10 @@ define <8 x i32> @test_8xi32_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) {
define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1060,10 +1044,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2
define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1073,10 +1056,10 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32>
define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1087,10 +1070,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2
define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1100,10 +1082,10 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32>
define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1114,10 +1096,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2
define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1127,7 +1108,7 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32>
define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) {
; CHECK-LABEL: test_8xi32_shuff_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT: retq
%res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
ret <8 x i32> %res
@@ -1135,10 +1116,10 @@ define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) {
define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm3, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1149,10 +1130,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2
define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1162,7 +1142,7 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32>
define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p) {
; CHECK-LABEL: test_8xi32_shuff_mem_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
+; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: retq
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -1171,10 +1151,10 @@ define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p)
define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -1186,10 +1166,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>*
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7]
; CHECK-NEXT: retq
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -1201,10 +1180,10 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i
define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -1216,10 +1195,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>*
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
; CHECK-NEXT: retq
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -1231,10 +1209,10 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i
define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -1246,10 +1224,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>*
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
; CHECK-NEXT: retq
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -1261,7 +1238,7 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i
define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p) {
; CHECK-LABEL: test_8xi32_shuff_mem_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
+; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: retq
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -1270,10 +1247,10 @@ define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p)
define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -1285,10 +1262,9 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>*
define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
; CHECK-NEXT: retq
%vec2 = load <8 x i32>, <8 x i32>* %vec2p
%shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -1554,7 +1530,7 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask3(<16 x i32> %vec1, <16
define <4 x i64> @test_4xi64_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) {
; CHECK-LABEL: test_4xi64_shuff_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT: retq
%res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
ret <4 x i64> %res
@@ -1562,10 +1538,10 @@ define <4 x i64> @test_4xi64_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) {
define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_masked_shuff_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -1576,10 +1552,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2
define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -1589,10 +1564,10 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64>
define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_masked_shuff_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -1603,10 +1578,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2
define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -1616,10 +1590,10 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64>
define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_masked_shuff_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -1630,10 +1604,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2
define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -1643,7 +1616,7 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64>
define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) {
; CHECK-LABEL: test_4xi64_shuff_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: retq
%res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
ret <4 x i64> %res
@@ -1651,10 +1624,10 @@ define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) {
define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_masked_shuff_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm3, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -1665,10 +1638,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2
define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm2, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -1678,7 +1650,7 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64>
define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p) {
; CHECK-LABEL: test_4xi64_shuff_mem_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
+; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -1687,10 +1659,10 @@ define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p)
define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -1702,10 +1674,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>*
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -1717,10 +1688,10 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i
define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -1732,10 +1703,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>*
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -1747,10 +1717,10 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i
define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -1762,10 +1732,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>*
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -1777,7 +1746,7 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i
define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p) {
; CHECK-LABEL: test_4xi64_shuff_mem_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
+; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -1786,10 +1755,10 @@ define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p)
define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -1801,10 +1770,9 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>*
define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
; CHECK: # BB#0:
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
; CHECK-NEXT: retq
%vec2 = load <4 x i64>, <4 x i64>* %vec2p
%shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
diff --git a/test/CodeGen/X86/avx512-skx-insert-subvec.ll b/test/CodeGen/X86/avx512-skx-insert-subvec.ll
index 23d66457994..ff25c005e9c 100644
--- a/test/CodeGen/X86/avx512-skx-insert-subvec.ll
+++ b/test/CodeGen/X86/avx512-skx-insert-subvec.ll
@@ -46,8 +46,6 @@ define <8 x i1> @test3(<4 x i1> %a) {
; CHECK: # BB#0:
; CHECK-NEXT: vpslld $31, %xmm0, %xmm0
; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k0
-; CHECK-NEXT: kshiftlb $4, %k0, %k0
-; CHECK-NEXT: kshiftrb $4, %k0, %k0
; CHECK-NEXT: vpmovm2w %k0, %xmm0
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/avx512-vbroadcast.ll b/test/CodeGen/X86/avx512-vbroadcast.ll
index 584968f1c6e..9aacb23fbd5 100644
--- a/test/CodeGen/X86/avx512-vbroadcast.ll
+++ b/test/CodeGen/X86/avx512-vbroadcast.ll
@@ -413,6 +413,7 @@ define <16 x float> @broadcast_ss_spill(float %x) {
; ALL-NEXT: callq func_f32
; ALL-NEXT: vbroadcastss (%rsp), %zmm0 # 16-byte Folded Reload
; ALL-NEXT: addq $24, %rsp
+; ALL-NEXT: .cfi_def_cfa_offset 8
; ALL-NEXT: retq
%a = fadd float %x, %x
call void @func_f32(float %a)
@@ -432,6 +433,7 @@ define <8 x double> @broadcast_sd_spill(double %x) {
; ALL-NEXT: callq func_f64
; ALL-NEXT: vbroadcastsd (%rsp), %zmm0 # 16-byte Folded Reload
; ALL-NEXT: addq $24, %rsp
+; ALL-NEXT: .cfi_def_cfa_offset 8
; ALL-NEXT: retq
%a = fadd double %x, %x
call void @func_f64(double %a)
diff --git a/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll
index d1bf8fd5f3f..7f170cd51bf 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll
@@ -717,6 +717,7 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: vpbroadcastb %eax, %zmm3 {%k1}
; X32-NEXT: vmovdqa64 %zmm3, %zmm0
; X32-NEXT: popl %ebx
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_mask_set1_epi8:
@@ -1444,6 +1445,7 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: korq %k0, %k1, %k1
; X32-NEXT: vpbroadcastb %eax, %zmm0 {%k1} {z}
; X32-NEXT: popl %ebx
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_set1_epi8:
diff --git a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
index a5ef1809157..87565ac129b 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
@@ -355,6 +355,7 @@ define i64 @test_pcmpeq_b(<64 x i8> %a, <64 x i8> %b) {
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $12, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 4
; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1)
@@ -380,6 +381,7 @@ define i64 @test_mask_pcmpeq_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $12, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 4
; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
@@ -445,6 +447,7 @@ define i64 @test_pcmpgt_b(<64 x i8> %a, <64 x i8> %b) {
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $12, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 4
; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1)
@@ -470,6 +473,7 @@ define i64 @test_mask_pcmpgt_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $12, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 4
; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
@@ -1702,6 +1706,7 @@ define i64 @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $60, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 4
; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
@@ -2503,8 +2508,11 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512F-32-NEXT: addl %esi, %eax
; AVX512F-32-NEXT: adcl %ecx, %edx
; AVX512F-32-NEXT: addl $60, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 12
; AVX512F-32-NEXT: popl %esi
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 8
; AVX512F-32-NEXT: popl %ebx
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 4
; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
@@ -2586,6 +2594,7 @@ define i64 @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $60, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 4
; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
@@ -3387,8 +3396,11 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512F-32-NEXT: addl %esi, %eax
; AVX512F-32-NEXT: adcl %ecx, %edx
; AVX512F-32-NEXT: addl $60, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 12
; AVX512F-32-NEXT: popl %esi
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 8
; AVX512F-32-NEXT: popl %ebx
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 4
; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll
index e23deebd15b..c2620642e5c 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -1499,6 +1499,7 @@ define i64@test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) {
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $12, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 4
; AVX512F-32-NEXT: retl
%res = call i64 @llvm.x86.avx512.kunpck.dq(i64 %x0, i64 %x1)
ret i64 %res
@@ -1522,6 +1523,7 @@ define i64@test_int_x86_avx512_cvtb2mask_512(<64 x i8> %x0) {
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $12, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 4
; AVX512F-32-NEXT: retl
%res = call i64 @llvm.x86.avx512.cvtb2mask.512(<64 x i8> %x0)
ret i64 %res
@@ -1712,6 +1714,7 @@ define i64@test_int_x86_avx512_ptestm_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $20, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 4
; AVX512F-32-NEXT: retl
%res = call i64 @llvm.x86.avx512.ptestm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2)
%res1 = call i64 @llvm.x86.avx512.ptestm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64-1)
@@ -1776,6 +1779,7 @@ define i64@test_int_x86_avx512_ptestnm_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 %
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $20, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 4
; AVX512F-32-NEXT: retl
%res = call i64 @llvm.x86.avx512.ptestnm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2)
%res1 = call i64 @llvm.x86.avx512.ptestnm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64-1)
diff --git a/test/CodeGen/X86/avx512bw-vec-test-testn.ll b/test/CodeGen/X86/avx512bw-vec-test-testn.ll
index 6dd6440faa1..82d0b8846de 100644
--- a/test/CodeGen/X86/avx512bw-vec-test-testn.ll
+++ b/test/CodeGen/X86/avx512bw-vec-test-testn.ll
@@ -5,9 +5,7 @@
define zeroext i32 @TEST_mm512_test_epi16_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_test_epi16_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpneqw %zmm1, %zmm0, %k0
+; CHECK-NEXT: vptestmw %zmm0, %zmm1, %k0
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -24,9 +22,7 @@ entry:
define zeroext i64 @TEST_mm512_test_epi8_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_test_epi8_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpneqb %zmm1, %zmm0, %k0
+; CHECK-NEXT: vptestmb %zmm0, %zmm1, %k0
; CHECK-NEXT: kmovq %k0, %rax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -42,10 +38,8 @@ entry:
define zeroext i32 @TEST_mm512_mask_test_epi16_mask(i32 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_mask_test_epi16_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: vptestmw %zmm0, %zmm1, %k0 {%k1}
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -63,10 +57,8 @@ entry:
define zeroext i64 @TEST_mm512_mask_test_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_mask_test_epi8_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovq %rdi, %k1
-; CHECK-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: vptestmb %zmm0, %zmm1, %k0 {%k1}
; CHECK-NEXT: kmovq %k0, %rax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -84,9 +76,7 @@ entry:
define zeroext i32 @TEST_mm512_testn_epi16_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_testn_epi16_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k0
+; CHECK-NEXT: vptestnmw %zmm0, %zmm1, %k0
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -103,9 +93,7 @@ entry:
define zeroext i64 @TEST_mm512_testn_epi8_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_testn_epi8_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
+; CHECK-NEXT: vptestnmb %zmm0, %zmm1, %k0
; CHECK-NEXT: kmovq %k0, %rax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -121,10 +109,8 @@ entry:
define zeroext i32 @TEST_mm512_mask_testn_epi16_mask(i32 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_mask_testn_epi16_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: vptestnmw %zmm0, %zmm1, %k0 {%k1}
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -142,10 +128,8 @@ entry:
define zeroext i64 @TEST_mm512_mask_testn_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_mask_testn_epi8_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovq %rdi, %k1
-; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: vptestnmb %zmm0, %zmm1, %k0 {%k1}
; CHECK-NEXT: kmovq %k0, %rax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/avx512bwvl-vec-test-testn.ll b/test/CodeGen/X86/avx512bwvl-vec-test-testn.ll
index f67ceb2fe04..44075deb1d9 100644
--- a/test/CodeGen/X86/avx512bwvl-vec-test-testn.ll
+++ b/test/CodeGen/X86/avx512bwvl-vec-test-testn.ll
@@ -5,9 +5,7 @@
define zeroext i16 @TEST_mm_test_epi8_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm_test_epi8_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpneqb %xmm1, %xmm0, %k0
+; CHECK-NEXT: vptestmb %xmm0, %xmm1, %k0
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: retq
@@ -23,10 +21,8 @@ entry:
define zeroext i16 @TEST_mm_mask_test_epi8_mask(i16 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm_mask_test_epi8_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpneqb %xmm1, %xmm0, %k0 {%k1}
+; CHECK-NEXT: vptestmb %xmm0, %xmm1, %k0 {%k1}
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: retq
@@ -44,9 +40,7 @@ entry:
define zeroext i8 @TEST_mm_test_epi16_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm_test_epi16_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpneqw %xmm1, %xmm0, %k0
+; CHECK-NEXT: vptestmw %xmm0, %xmm1, %k0
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: retq
@@ -62,10 +56,8 @@ entry:
define zeroext i8 @TEST_mm_mask_test_epi16_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm_mask_test_epi16_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpneqw %xmm1, %xmm0, %k0 {%k1}
+; CHECK-NEXT: vptestmw %xmm0, %xmm1, %k0 {%k1}
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: retq
@@ -83,9 +75,7 @@ entry:
define zeroext i16 @TEST_mm_testn_epi8_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm_testn_epi8_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0
+; CHECK-NEXT: vptestnmb %xmm0, %xmm1, %k0
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: retq
@@ -101,10 +91,8 @@ entry:
define zeroext i16 @TEST_mm_mask_testn_epi8_mask(i16 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm_mask_testn_epi8_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1}
+; CHECK-NEXT: vptestnmb %xmm0, %xmm1, %k0 {%k1}
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: retq
@@ -122,9 +110,7 @@ entry:
define zeroext i8 @TEST_mm_testn_epi16_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm_testn_epi16_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0
+; CHECK-NEXT: vptestnmw %xmm0, %xmm1, %k0
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: retq
@@ -140,10 +126,8 @@ entry:
define zeroext i8 @TEST_mm_mask_testn_epi16_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm_mask_testn_epi16_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1}
+; CHECK-NEXT: vptestnmw %xmm0, %xmm1, %k0 {%k1}
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: retq
@@ -161,9 +145,7 @@ entry:
define i32 @TEST_mm256_test_epi8_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm256_test_epi8_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpneqb %ymm1, %ymm0, %k0
+; CHECK-NEXT: vptestmb %ymm0, %ymm1, %k0
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -179,10 +161,8 @@ entry:
define i32 @TEST_mm256_mask_test_epi8_mask(i32 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm256_mask_test_epi8_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpneqb %ymm1, %ymm0, %k0 {%k1}
+; CHECK-NEXT: vptestmb %ymm0, %ymm1, %k0 {%k1}
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -200,9 +180,7 @@ entry:
define zeroext i16 @TEST_mm256_test_epi16_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm256_test_epi16_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpneqw %ymm1, %ymm0, %k0
+; CHECK-NEXT: vptestmw %ymm0, %ymm1, %k0
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: vzeroupper
@@ -219,10 +197,8 @@ entry:
define zeroext i16 @TEST_mm256_mask_test_epi16_mask(i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm256_mask_test_epi16_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpneqw %ymm1, %ymm0, %k0 {%k1}
+; CHECK-NEXT: vptestmw %ymm0, %ymm1, %k0 {%k1}
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: vzeroupper
@@ -241,9 +217,7 @@ entry:
define i32 @TEST_mm256_testn_epi8_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm256_testn_epi8_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0
+; CHECK-NEXT: vptestnmb %ymm0, %ymm1, %k0
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -259,10 +233,8 @@ entry:
define i32 @TEST_mm256_mask_testn_epi8_mask(i32 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm256_mask_testn_epi8_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1}
+; CHECK-NEXT: vptestnmb %ymm0, %ymm1, %k0 {%k1}
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -280,9 +252,7 @@ entry:
define zeroext i16 @TEST_mm256_testn_epi16_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm256_testn_epi16_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0
+; CHECK-NEXT: vptestnmw %ymm0, %ymm1, %k0
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: vzeroupper
@@ -299,10 +269,8 @@ entry:
define zeroext i16 @TEST_mm256_mask_testn_epi16_mask(i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm256_mask_testn_epi16_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1}
+; CHECK-NEXT: vptestnmw %ymm0, %ymm1, %k0 {%k1}
; CHECK-NEXT: kmovd %k0, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/avx512cd-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512cd-intrinsics-fast-isel.ll
new file mode 100644
index 00000000000..ca5e5523a9d
--- /dev/null
+++ b/test/CodeGen/X86/avx512cd-intrinsics-fast-isel.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512cd | FileCheck %s
+
+define <8 x i64> @test_mm512_broadcastmb_epi64(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test_mm512_broadcastmb_epi64:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
+; CHECK-NEXT: vpbroadcastmb2q %k0, %zmm0
+; CHECK-NEXT: retq
+entry:
+ %0 = icmp eq <8 x i64> %a, %b
+ %1 = bitcast <8 x i1> %0 to i8
+ %conv.i = zext i8 %1 to i64
+ %vecinit.i.i = insertelement <8 x i64> undef, i64 %conv.i, i32 0
+ %vecinit7.i.i = shufflevector <8 x i64> %vecinit.i.i, <8 x i64> undef, <8 x i32> zeroinitializer
+ ret <8 x i64> %vecinit7.i.i
+}
+
+define <8 x i64> @test_mm512_broadcastmw_epi32(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test_mm512_broadcastmw_epi32:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; CHECK-NEXT: vpbroadcastmw2d %k0, %zmm0
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %a to <16 x i32>
+ %1 = bitcast <8 x i64> %b to <16 x i32>
+ %2 = icmp eq <16 x i32> %0, %1
+ %3 = bitcast <16 x i1> %2 to i16
+ %conv.i = zext i16 %3 to i32
+ %vecinit.i.i = insertelement <16 x i32> undef, i32 %conv.i, i32 0
+ %vecinit15.i.i = shufflevector <16 x i32> %vecinit.i.i, <16 x i32> undef, <16 x i32> zeroinitializer
+ %4 = bitcast <16 x i32> %vecinit15.i.i to <8 x i64>
+ ret <8 x i64> %4
+}
+
+
diff --git a/test/CodeGen/X86/avx512cd-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512cd-intrinsics-upgrade.ll
index e5dbff9ac51..92dfe1e087a 100644
--- a/test/CodeGen/X86/avx512cd-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512cd-intrinsics-upgrade.ll
@@ -45,3 +45,26 @@ define <8 x i64> @test_mask_lzcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
%res = call <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
ret <8 x i64> %res
}
+
+define <16 x i32> @test_x86_vbroadcastmw_512(i16 %a0) {
+; CHECK-LABEL: test_x86_vbroadcastmw_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzwl %di, %eax
+; CHECK-NEXT: vpbroadcastd %eax, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.broadcastmw.512(i16 %a0)
+ ret <16 x i32> %res
+}
+declare <16 x i32> @llvm.x86.avx512.broadcastmw.512(i16)
+
+define <8 x i64> @test_x86_broadcastmb_512(i8 %a0) {
+; CHECK-LABEL: test_x86_broadcastmb_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: vpbroadcastq %rax, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.broadcastmb.512(i8 %a0)
+ ret <8 x i64> %res
+}
+declare <8 x i64> @llvm.x86.avx512.broadcastmb.512(i8)
+
diff --git a/test/CodeGen/X86/avx512cd-intrinsics.ll b/test/CodeGen/X86/avx512cd-intrinsics.ll
index 7e5a3e8fe25..ab8c80f8dd3 100644
--- a/test/CodeGen/X86/avx512cd-intrinsics.ll
+++ b/test/CodeGen/X86/avx512cd-intrinsics.ll
@@ -1,28 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s
-define <16 x i32> @test_x86_vbroadcastmw_512(i16 %a0) {
-; CHECK-LABEL: test_x86_vbroadcastmw_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k0
-; CHECK-NEXT: vpbroadcastmw2d %k0, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.broadcastmw.512(i16 %a0)
- ret <16 x i32> %res
-}
-declare <16 x i32> @llvm.x86.avx512.broadcastmw.512(i16)
-
-define <8 x i64> @test_x86_broadcastmb_512(i8 %a0) {
-; CHECK-LABEL: test_x86_broadcastmb_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k0
-; CHECK-NEXT: vpbroadcastmb2q %k0, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.broadcastmb.512(i8 %a0)
- ret <8 x i64> %res
-}
-declare <8 x i64> @llvm.x86.avx512.broadcastmb.512(i8)
-
declare <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly
define <8 x i64> @test_conflict_q(<8 x i64> %a) {
diff --git a/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll
index f8f47c87100..0e310be3489 100644
--- a/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll
@@ -69,3 +69,47 @@ define <4 x i64>@test_int_x86_avx512_mask_vplzcnt_q_256(<4 x i64> %x0, <4 x i64>
ret <4 x i64> %res2
}
+define <8 x i32> @test_x86_vbroadcastmw_256(i16 %a0) {
+; CHECK-LABEL: test_x86_vbroadcastmw_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzwl %di, %eax
+; CHECK-NEXT: vpbroadcastd %eax, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16 %a0) ;
+ ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16)
+
+define <4 x i32> @test_x86_vbroadcastmw_128(i16 %a0) {
+; CHECK-LABEL: test_x86_vbroadcastmw_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzwl %di, %eax
+; CHECK-NEXT: vpbroadcastd %eax, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16 %a0) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16)
+
+define <4 x i64> @test_x86_broadcastmb_256(i8 %a0) {
+; CHECK-LABEL: test_x86_broadcastmb_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: vpbroadcastq %rax, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8 %a0) ;
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8)
+
+define <2 x i64> @test_x86_broadcastmb_128(i8 %a0) {
+; CHECK-LABEL: test_x86_broadcastmb_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: vpbroadcastq %rax, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8 %a0) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8)
+
diff --git a/test/CodeGen/X86/avx512cdvl-intrinsics.ll b/test/CodeGen/X86/avx512cdvl-intrinsics.ll
index 96254f7c95b..2fb50297c62 100644
--- a/test/CodeGen/X86/avx512cdvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512cdvl-intrinsics.ll
@@ -147,46 +147,3 @@ define <4 x i64>@test_int_x86_avx512_mask_vpconflict_q_256(<4 x i64> %x0, <4 x i
ret <4 x i64> %res2
}
-define <8 x i32> @test_x86_vbroadcastmw_256(i16 %a0) {
-; CHECK-LABEL: test_x86_vbroadcastmw_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k0
-; CHECK-NEXT: vpbroadcastmw2d %k0, %ymm0
-; CHECK-NEXT: retq
- %res = call <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16 %a0) ;
- ret <8 x i32> %res
-}
-declare <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16)
-
-define <4 x i32> @test_x86_vbroadcastmw_128(i16 %a0) {
-; CHECK-LABEL: test_x86_vbroadcastmw_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k0
-; CHECK-NEXT: vpbroadcastmw2d %k0, %xmm0
-; CHECK-NEXT: retq
- %res = call <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16 %a0) ;
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16)
-
-define <4 x i64> @test_x86_broadcastmb_256(i8 %a0) {
-; CHECK-LABEL: test_x86_broadcastmb_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k0
-; CHECK-NEXT: vpbroadcastmb2q %k0, %ymm0
-; CHECK-NEXT: retq
- %res = call <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8 %a0) ;
- ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8)
-
-define <2 x i64> @test_x86_broadcastmb_128(i8 %a0) {
-; CHECK-LABEL: test_x86_broadcastmb_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k0
-; CHECK-NEXT: vpbroadcastmb2q %k0, %xmm0
-; CHECK-NEXT: retq
- %res = call <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8 %a0) ;
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8)
diff --git a/test/CodeGen/X86/avx512f-vec-test-testn.ll b/test/CodeGen/X86/avx512f-vec-test-testn.ll
index c9c0c2251a4..e9cdacc354f 100644
--- a/test/CodeGen/X86/avx512f-vec-test-testn.ll
+++ b/test/CodeGen/X86/avx512f-vec-test-testn.ll
@@ -5,9 +5,7 @@
define zeroext i8 @TEST_mm512_test_epi64_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_test_epi64_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpneqq %zmm1, %zmm0, %k0
+; CHECK-NEXT: vptestmq %zmm0, %zmm1, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: vzeroupper
@@ -23,9 +21,7 @@ entry:
define zeroext i16 @TEST_mm512_test_epi32_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_test_epi32_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
+; CHECK-NEXT: vptestmd %zmm0, %zmm1, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: vzeroupper
@@ -42,10 +38,8 @@ entry:
define zeroext i8 @TEST_mm512_mask_test_epi64_mask(i8 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_mask_test_epi64_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpcmpneqq %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: vptestmq %zmm0, %zmm1, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: vzeroupper
@@ -63,10 +57,8 @@ entry:
define zeroext i16 @TEST_mm512_mask_test_epi32_mask(i16 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_mask_test_epi32_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: vptestmd %zmm0, %zmm1, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: vzeroupper
@@ -85,9 +77,7 @@ entry:
define zeroext i8 @TEST_mm512_testn_epi64_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_testn_epi64_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
+; CHECK-NEXT: vptestnmq %zmm0, %zmm1, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: vzeroupper
@@ -103,9 +93,7 @@ entry:
define zeroext i16 @TEST_mm512_testn_epi32_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_testn_epi32_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; CHECK-NEXT: vptestnmd %zmm0, %zmm1, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: vzeroupper
@@ -122,10 +110,8 @@ entry:
define zeroext i8 @TEST_mm512_mask_testn_epi64_mask(i8 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_mask_testn_epi64_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: vptestnmq %zmm0, %zmm1, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; CHECK-NEXT: vzeroupper
@@ -143,10 +129,8 @@ entry:
define zeroext i16 @TEST_mm512_mask_testn_epi32_mask(i16 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_mask_testn_epi32_mask:
; CHECK: # BB#0: # %entry
-; CHECK-NEXT: vpandq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: vptestnmd %zmm0, %zmm1, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; CHECK-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
index f5578d6cc88..3f4a696af0c 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
@@ -233,6 +233,7 @@ define <2 x i64> @test_mm_mask_broadcastd_epi32(<2 x i64> %a0, i8 %a1, <2 x i64>
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpbroadcastd %xmm1, %xmm0 {%k1}
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_broadcastd_epi32:
@@ -265,6 +266,7 @@ define <2 x i64> @test_mm_maskz_broadcastd_epi32(i8 %a0, <2 x i64> %a1) {
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z}
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_broadcastd_epi32:
@@ -369,6 +371,7 @@ define <2 x i64> @test_mm_mask_broadcastq_epi64(<2 x i64> %a0, i8 %a1, <2 x i64>
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpbroadcastq %xmm1, %xmm0 {%k1}
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_broadcastq_epi64:
@@ -398,6 +401,7 @@ define <2 x i64> @test_mm_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z}
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_broadcastq_epi64:
@@ -441,6 +445,7 @@ define <4 x i64> @test_mm256_mask_broadcastq_epi64(<4 x i64> %a0, i8 %a1, <2 x i
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpbroadcastq %xmm1, %ymm0 {%k1}
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mask_broadcastq_epi64:
@@ -470,6 +475,7 @@ define <4 x i64> @test_mm256_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z}
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_broadcastq_epi64:
@@ -513,6 +519,7 @@ define <2 x double> @test_mm_mask_broadcastsd_pd(<2 x double> %a0, i8 %a1, <2 x
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_broadcastsd_pd:
@@ -542,6 +549,7 @@ define <2 x double> @test_mm_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_broadcastsd_pd:
@@ -585,6 +593,7 @@ define <4 x double> @test_mm256_mask_broadcastsd_pd(<4 x double> %a0, i8 %a1, <2
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vbroadcastsd %xmm1, %ymm0 {%k1}
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mask_broadcastsd_pd:
@@ -614,6 +623,7 @@ define <4 x double> @test_mm256_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z}
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_broadcastsd_pd:
@@ -657,6 +667,7 @@ define <4 x float> @test_mm_mask_broadcastss_ps(<4 x float> %a0, i8 %a1, <4 x fl
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vbroadcastss %xmm1, %xmm0 {%k1}
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_broadcastss_ps:
@@ -686,6 +697,7 @@ define <4 x float> @test_mm_maskz_broadcastss_ps(i8 %a0, <4 x float> %a1) {
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z}
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_broadcastss_ps:
@@ -781,6 +793,7 @@ define <2 x double> @test_mm_mask_movddup_pd(<2 x double> %a0, i8 %a1, <2 x doub
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_movddup_pd:
@@ -810,6 +823,7 @@ define <2 x double> @test_mm_maskz_movddup_pd(i8 %a0, <2 x double> %a1) {
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_movddup_pd:
@@ -853,6 +867,7 @@ define <4 x double> @test_mm256_mask_movddup_pd(<4 x double> %a0, i8 %a1, <4 x d
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mask_movddup_pd:
@@ -882,6 +897,7 @@ define <4 x double> @test_mm256_maskz_movddup_pd(i8 %a0, <4 x double> %a1) {
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_movddup_pd:
@@ -925,6 +941,7 @@ define <4 x float> @test_mm_mask_movehdup_ps(<4 x float> %a0, i8 %a1, <4 x float
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_movehdup_ps:
@@ -954,6 +971,7 @@ define <4 x float> @test_mm_maskz_movehdup_ps(i8 %a0, <4 x float> %a1) {
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_movehdup_ps:
@@ -1049,6 +1067,7 @@ define <4 x float> @test_mm_mask_moveldup_ps(<4 x float> %a0, i8 %a1, <4 x float
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_moveldup_ps:
@@ -1078,6 +1097,7 @@ define <4 x float> @test_mm_maskz_moveldup_ps(i8 %a0, <4 x float> %a1) {
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_moveldup_ps:
@@ -1173,6 +1193,7 @@ define <4 x i64> @test_mm256_mask_permutex_epi64(<4 x i64> %a0, i8 %a1, <4 x i64
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mask_permutex_epi64:
@@ -1202,6 +1223,7 @@ define <4 x i64> @test_mm256_maskz_permutex_epi64(i8 %a0, <4 x i64> %a1) {
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_permutex_epi64:
@@ -1245,6 +1267,7 @@ define <4 x double> @test_mm256_mask_permutex_pd(<4 x double> %a0, i8 %a1, <4 x
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mask_permutex_pd:
@@ -1274,6 +1297,7 @@ define <4 x double> @test_mm256_maskz_permutex_pd(i8 %a0, <4 x double> %a1) {
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_permutex_pd:
@@ -1317,6 +1341,7 @@ define <2 x double> @test_mm_mask_shuffle_pd(<2 x double> %a0, i8 %a1, <2 x doub
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_shuffle_pd:
@@ -1346,6 +1371,7 @@ define <2 x double> @test_mm_maskz_shuffle_pd(i8 %a0, <2 x double> %a1, <2 x dou
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_shuffle_pd:
@@ -1389,6 +1415,7 @@ define <4 x double> @test_mm256_mask_shuffle_pd(<4 x double> %a0, i8 %a1, <4 x d
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mask_shuffle_pd:
@@ -1418,6 +1445,7 @@ define <4 x double> @test_mm256_maskz_shuffle_pd(i8 %a0, <4 x double> %a1, <4 x
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_shuffle_pd:
@@ -1461,6 +1489,7 @@ define <4 x float> @test_mm_mask_shuffle_ps(<4 x float> %a0, i8 %a1, <4 x float>
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_shuffle_ps:
@@ -1490,6 +1519,7 @@ define <4 x float> @test_mm_maskz_shuffle_ps(i8 %a0, <4 x float> %a1, <4 x float
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0]
; X32-NEXT: popl %eax
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_shuffle_ps:
diff --git a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
index b6723ee50b0..6c6fad794c8 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
@@ -4712,8 +4712,8 @@ declare <8 x i32> @llvm.x86.avx512.mask.valign.d.256(<8 x i32>, <8 x i32>, i32,
define <8 x i32>@test_int_x86_avx512_mask_valign_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_valign_d_256:
; CHECK: ## BB#0:
-; CHECK-NEXT: valignd $6, %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf3,0x7d,0x28,0x03,0xd9,0x06]
-; CHECK-NEXT: ## ymm3 = ymm1[6,7],ymm0[0,1,2,3,4,5]
+; CHECK-NEXT: valignq $3, %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf3,0xfd,0x28,0x03,0xd9,0x03]
+; CHECK-NEXT: ## ymm3 = ymm1[3],ymm0[0,1,2]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: valignd $6, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x03,0xd1,0x06]
; CHECK-NEXT: ## ymm2 {%k1} = ymm1[6,7],ymm0[0,1,2,3,4,5]
diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll
index 9098ca30897..35fecf8955c 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -2729,8 +2729,8 @@ define <8 x float>@test_int_x86_avx512_mask_shuf_f32x4_256(<8 x float> %x0, <8 x
; CHECK-NEXT: ## ymm3 {%k1} {z} = ymm0[0,1,2,3],ymm1[4,5,6,7]
; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x23,0xd1,0x16]
; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x23,0xc1,0x16]
-; CHECK-NEXT: ## ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-NEXT: vperm2f128 $48, %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x30]
+; CHECK-NEXT: ## ymm0 = ymm0[0,1],ymm1[2,3]
; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc0]
; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe4,0x58,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2752,7 +2752,7 @@ define <4 x double>@test_int_x86_avx512_mask_shuf_f64x2_256(<4 x double> %x0, <4
; CHECK-NEXT: ## ymm3 {%k1} {z} = ymm0[0,1],ymm1[2,3]
; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x23,0xd1,0x16]
; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0,1],ymm1[2,3]
-; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x23,0xc1,0x16]
+; CHECK-NEXT: vperm2f128 $48, %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x30]
; CHECK-NEXT: ## ymm0 = ymm0[0,1],ymm1[2,3]
; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0]
; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0]
@@ -2773,8 +2773,8 @@ define <8 x i32>@test_int_x86_avx512_mask_shuf_i32x4_256(<8 x i32> %x0, <8 x i32
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vshufi32x4 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x43,0xd1,0x16]
; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vshufi32x4 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x43,0xc1,0x16]
-; CHECK-NEXT: ## ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-NEXT: vperm2i128 $48, %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x46,0xc1,0x30]
+; CHECK-NEXT: ## ymm0 = ymm0[0,1],ymm1[2,3]
; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.shuf.i32x4.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 %x4)
@@ -2791,7 +2791,7 @@ define <4 x i64>@test_int_x86_avx512_mask_shuf_i64x2_256(<4 x i64> %x0, <4 x i64
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vshufi64x2 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x43,0xd1,0x16]
; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0,1],ymm1[2,3]
-; CHECK-NEXT: vshufi64x2 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x43,0xc1,0x16]
+; CHECK-NEXT: vperm2i128 $48, %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x46,0xc1,0x30]
; CHECK-NEXT: ## ymm0 = ymm0[0,1],ymm1[2,3]
; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
diff --git a/test/CodeGen/X86/avx512vl-vbroadcast.ll b/test/CodeGen/X86/avx512vl-vbroadcast.ll
index 9fc957297e2..1098e7bffe0 100644
--- a/test/CodeGen/X86/avx512vl-vbroadcast.ll
+++ b/test/CodeGen/X86/avx512vl-vbroadcast.ll
@@ -12,6 +12,7 @@ define <8 x float> @_256_broadcast_ss_spill(float %x) {
; CHECK-NEXT: callq func_f32
; CHECK-NEXT: vbroadcastss (%rsp), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT: addq $24, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%a = fadd float %x, %x
call void @func_f32(float %a)
@@ -30,6 +31,7 @@ define <4 x float> @_128_broadcast_ss_spill(float %x) {
; CHECK-NEXT: callq func_f32
; CHECK-NEXT: vbroadcastss (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: addq $24, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%a = fadd float %x, %x
call void @func_f32(float %a)
@@ -49,6 +51,7 @@ define <4 x double> @_256_broadcast_sd_spill(double %x) {
; CHECK-NEXT: callq func_f64
; CHECK-NEXT: vbroadcastsd (%rsp), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT: addq $24, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%a = fadd double %x, %x
call void @func_f64(double %a)
diff --git a/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
index 5ee06fde127..bccf953fb0b 100644
--- a/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
+++ b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
@@ -109,6 +109,7 @@ define zeroext i32 @test_vpcmpeqb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -227,6 +228,7 @@ define zeroext i32 @test_vpcmpeqb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -348,6 +350,7 @@ define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -470,6 +473,7 @@ define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -597,6 +601,7 @@ define zeroext i64 @test_vpcmpeqb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -720,6 +725,7 @@ define zeroext i64 @test_vpcmpeqb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -846,6 +852,7 @@ define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -973,6 +980,7 @@ define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -1024,6 +1032,7 @@ define zeroext i64 @test_vpcmpeqb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -1071,6 +1080,7 @@ define zeroext i64 @test_vpcmpeqb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -1129,6 +1139,7 @@ define zeroext i64 @test_masked_vpcmpeqb_v32i1_v64i1_mask(i32 zeroext %__u, <4 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -1188,6 +1199,7 @@ define zeroext i64 @test_masked_vpcmpeqb_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -1217,8 +1229,6 @@ define zeroext i16 @test_vpcmpeqw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -1246,8 +1256,6 @@ define zeroext i16 @test_vpcmpeqw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -1278,8 +1286,6 @@ define zeroext i16 @test_masked_vpcmpeqw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -1311,8 +1317,6 @@ define zeroext i16 @test_masked_vpcmpeqw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -1392,6 +1396,7 @@ define zeroext i32 @test_vpcmpeqw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -1465,6 +1470,7 @@ define zeroext i32 @test_vpcmpeqw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -1541,6 +1547,7 @@ define zeroext i32 @test_masked_vpcmpeqw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -1618,6 +1625,7 @@ define zeroext i32 @test_masked_vpcmpeqw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -1700,6 +1708,7 @@ define zeroext i64 @test_vpcmpeqw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -1778,6 +1787,7 @@ define zeroext i64 @test_vpcmpeqw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -1859,6 +1869,7 @@ define zeroext i64 @test_masked_vpcmpeqw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -1941,6 +1952,7 @@ define zeroext i64 @test_masked_vpcmpeqw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -2064,6 +2076,7 @@ define zeroext i32 @test_vpcmpeqw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -2183,6 +2196,7 @@ define zeroext i32 @test_vpcmpeqw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -2305,6 +2319,7 @@ define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -2428,6 +2443,7 @@ define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -2556,6 +2572,7 @@ define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -2680,6 +2697,7 @@ define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -2807,6 +2825,7 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -2935,6 +2954,7 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -3288,6 +3308,7 @@ define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -3552,6 +3573,7 @@ define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -3912,6 +3934,7 @@ define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask(i32 zeroext %__u, <8 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -4188,6 +4211,7 @@ define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -5051,6 +5075,7 @@ define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -5092,6 +5117,7 @@ define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -5153,6 +5179,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -5216,6 +5243,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -5263,6 +5291,7 @@ define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -5326,6 +5355,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -5379,6 +5409,7 @@ define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -5426,6 +5457,7 @@ define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -5493,6 +5525,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -5562,6 +5595,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -5615,6 +5649,7 @@ define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -5684,6 +5719,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -5957,6 +5993,7 @@ define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -6030,6 +6067,7 @@ define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -6106,6 +6144,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -6183,6 +6222,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -6260,6 +6300,7 @@ define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -6337,6 +6378,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -6420,6 +6462,7 @@ define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -6498,6 +6541,7 @@ define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -6579,6 +6623,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -6661,6 +6706,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -6743,6 +6789,7 @@ define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -6825,6 +6872,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -6946,6 +6994,7 @@ define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -7062,6 +7111,7 @@ define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -7181,6 +7231,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask(i16 zeroext %__u, <8 x
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -7301,6 +7352,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -7421,6 +7473,7 @@ define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -7541,6 +7594,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b(i16 zeroext %__u
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -7667,6 +7721,7 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -7788,6 +7843,7 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -7912,6 +7968,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask(i16 zeroext %__u, <8 x
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -8037,6 +8094,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -8162,6 +8220,7 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -8287,6 +8346,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -9131,6 +9191,7 @@ define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -9172,6 +9233,7 @@ define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -9225,6 +9287,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -9280,6 +9343,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -9327,6 +9391,7 @@ define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -9382,6 +9447,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -9435,6 +9501,7 @@ define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -9482,6 +9549,7 @@ define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -9541,6 +9609,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -9602,6 +9671,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -9655,6 +9725,7 @@ define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -9716,6 +9787,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -10607,6 +10679,7 @@ define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -10650,6 +10723,7 @@ define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -10713,6 +10787,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -10778,6 +10853,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -10827,6 +10903,7 @@ define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -10892,6 +10969,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -10947,6 +11025,7 @@ define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -10996,6 +11075,7 @@ define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -11065,6 +11145,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -11136,6 +11217,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -11191,6 +11273,7 @@ define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -11262,6 +11345,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -11509,6 +11593,7 @@ define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -11580,6 +11665,7 @@ define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -11654,6 +11740,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -11729,6 +11816,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -11804,6 +11892,7 @@ define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -11879,6 +11968,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -11960,6 +12050,7 @@ define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -12036,6 +12127,7 @@ define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -12115,6 +12207,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -12195,6 +12288,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -12275,6 +12369,7 @@ define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -12355,6 +12450,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -12478,6 +12574,7 @@ define zeroext i32 @test_vpcmpsgtb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -12596,6 +12693,7 @@ define zeroext i32 @test_vpcmpsgtb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -12717,6 +12815,7 @@ define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -12839,6 +12938,7 @@ define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -12966,6 +13066,7 @@ define zeroext i64 @test_vpcmpsgtb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -13089,6 +13190,7 @@ define zeroext i64 @test_vpcmpsgtb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -13215,6 +13317,7 @@ define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -13342,6 +13445,7 @@ define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -13393,6 +13497,7 @@ define zeroext i64 @test_vpcmpsgtb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -13440,6 +13545,7 @@ define zeroext i64 @test_vpcmpsgtb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -13498,6 +13604,7 @@ define zeroext i64 @test_masked_vpcmpsgtb_v32i1_v64i1_mask(i32 zeroext %__u, <4
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -13557,6 +13664,7 @@ define zeroext i64 @test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -13586,8 +13694,6 @@ define zeroext i16 @test_vpcmpsgtw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -13615,8 +13721,6 @@ define zeroext i16 @test_vpcmpsgtw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -13647,8 +13751,6 @@ define zeroext i16 @test_masked_vpcmpsgtw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -13680,8 +13782,6 @@ define zeroext i16 @test_masked_vpcmpsgtw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -13761,6 +13861,7 @@ define zeroext i32 @test_vpcmpsgtw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -13834,6 +13935,7 @@ define zeroext i32 @test_vpcmpsgtw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -13910,6 +14012,7 @@ define zeroext i32 @test_masked_vpcmpsgtw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -13987,6 +14090,7 @@ define zeroext i32 @test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -14069,6 +14173,7 @@ define zeroext i64 @test_vpcmpsgtw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -14147,6 +14252,7 @@ define zeroext i64 @test_vpcmpsgtw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -14228,6 +14334,7 @@ define zeroext i64 @test_masked_vpcmpsgtw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -14310,6 +14417,7 @@ define zeroext i64 @test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -14433,6 +14541,7 @@ define zeroext i32 @test_vpcmpsgtw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -14552,6 +14661,7 @@ define zeroext i32 @test_vpcmpsgtw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -14674,6 +14784,7 @@ define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -14797,6 +14908,7 @@ define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -14925,6 +15037,7 @@ define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -15049,6 +15162,7 @@ define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -15176,6 +15290,7 @@ define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -15304,6 +15419,7 @@ define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -15657,6 +15773,7 @@ define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -15921,6 +16038,7 @@ define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -16281,6 +16399,7 @@ define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask(i32 zeroext %__u, <8
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -16557,6 +16676,7 @@ define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -17420,6 +17540,7 @@ define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -17461,6 +17582,7 @@ define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -17522,6 +17644,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -17585,6 +17708,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -17632,6 +17756,7 @@ define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -17695,6 +17820,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -17748,6 +17874,7 @@ define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -17795,6 +17922,7 @@ define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -17862,6 +17990,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -17931,6 +18060,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -17984,6 +18114,7 @@ define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -18053,6 +18184,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -18326,6 +18458,7 @@ define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -18399,6 +18532,7 @@ define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -18475,6 +18609,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -18552,6 +18687,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -18629,6 +18765,7 @@ define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -18706,6 +18843,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -18789,6 +18927,7 @@ define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -18867,6 +19006,7 @@ define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -18948,6 +19088,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -19030,6 +19171,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -19112,6 +19254,7 @@ define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -19194,6 +19337,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -19315,6 +19459,7 @@ define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -19431,6 +19576,7 @@ define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -19550,6 +19696,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -19670,6 +19817,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -19790,6 +19938,7 @@ define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -19910,6 +20059,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -20036,6 +20186,7 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -20157,6 +20308,7 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -20281,6 +20433,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -20406,6 +20559,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -20531,6 +20685,7 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -20656,6 +20811,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -21500,6 +21656,7 @@ define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -21541,6 +21698,7 @@ define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -21594,6 +21752,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -21649,6 +21808,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -21696,6 +21856,7 @@ define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -21751,6 +21912,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -21804,6 +21966,7 @@ define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -21851,6 +22014,7 @@ define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -21910,6 +22074,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -21971,6 +22136,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -22024,6 +22190,7 @@ define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -22085,6 +22252,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -22976,6 +23144,7 @@ define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -23019,6 +23188,7 @@ define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -23082,6 +23252,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -23147,6 +23318,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -23196,6 +23368,7 @@ define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -23261,6 +23434,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -23316,6 +23490,7 @@ define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -23365,6 +23540,7 @@ define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -23434,6 +23610,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -23505,6 +23682,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -23560,6 +23738,7 @@ define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -23631,6 +23810,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -23878,6 +24058,7 @@ define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -23949,6 +24130,7 @@ define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -24023,6 +24205,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -24098,6 +24281,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -24173,6 +24357,7 @@ define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -24248,6 +24433,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -24329,6 +24515,7 @@ define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -24405,6 +24592,7 @@ define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -24484,6 +24672,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -24564,6 +24753,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -24644,6 +24834,7 @@ define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -24724,6 +24915,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -24849,6 +25041,7 @@ define zeroext i32 @test_vpcmpsgeb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -24970,6 +25163,7 @@ define zeroext i32 @test_vpcmpsgeb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -25093,6 +25287,7 @@ define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -25218,6 +25413,7 @@ define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -25347,6 +25543,7 @@ define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -25473,6 +25670,7 @@ define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -25601,6 +25799,7 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -25731,6 +25930,7 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -25784,6 +25984,7 @@ define zeroext i64 @test_vpcmpsgeb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -25834,6 +26035,7 @@ define zeroext i64 @test_vpcmpsgeb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -25894,6 +26096,7 @@ define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask(i32 zeroext %__u, <4
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -25956,6 +26159,7 @@ define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -25987,8 +26191,6 @@ define zeroext i16 @test_vpcmpsgew_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -26019,8 +26221,6 @@ define zeroext i16 @test_vpcmpsgew_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -26053,8 +26253,6 @@ define zeroext i16 @test_masked_vpcmpsgew_v8i1_v16i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -26089,8 +26287,6 @@ define zeroext i16 @test_masked_vpcmpsgew_v8i1_v16i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -26172,6 +26368,7 @@ define zeroext i32 @test_vpcmpsgew_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -26248,6 +26445,7 @@ define zeroext i32 @test_vpcmpsgew_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -26326,6 +26524,7 @@ define zeroext i32 @test_masked_vpcmpsgew_v8i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -26406,6 +26605,7 @@ define zeroext i32 @test_masked_vpcmpsgew_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -26490,6 +26690,7 @@ define zeroext i64 @test_vpcmpsgew_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -26571,6 +26772,7 @@ define zeroext i64 @test_vpcmpsgew_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -26654,6 +26856,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v8i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -26739,6 +26942,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -26864,6 +27068,7 @@ define zeroext i32 @test_vpcmpsgew_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -26986,6 +27191,7 @@ define zeroext i32 @test_vpcmpsgew_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -27110,6 +27316,7 @@ define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -27236,6 +27443,7 @@ define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -27366,6 +27574,7 @@ define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -27493,6 +27702,7 @@ define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -27622,6 +27832,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -27753,6 +27964,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -28109,6 +28321,7 @@ define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -28378,6 +28591,7 @@ define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -28741,6 +28955,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask(i32 zeroext %__u, <8
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -29022,6 +29237,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -29903,6 +30119,7 @@ define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -29947,6 +30164,7 @@ define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -30008,6 +30226,7 @@ define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -30072,6 +30291,7 @@ define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -30121,6 +30341,7 @@ define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -30184,6 +30405,7 @@ define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -30239,6 +30461,7 @@ define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -30289,6 +30512,7 @@ define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -30356,6 +30580,7 @@ define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -30426,6 +30651,7 @@ define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -30481,6 +30707,7 @@ define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -30550,6 +30777,7 @@ define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -30823,6 +31051,7 @@ define zeroext i32 @test_vpcmpsged_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -30896,6 +31125,7 @@ define zeroext i32 @test_vpcmpsged_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -30972,6 +31202,7 @@ define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -31049,6 +31280,7 @@ define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -31126,6 +31358,7 @@ define zeroext i32 @test_vpcmpsged_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -31203,6 +31436,7 @@ define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -31286,6 +31520,7 @@ define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -31364,6 +31599,7 @@ define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -31445,6 +31681,7 @@ define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -31527,6 +31764,7 @@ define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -31609,6 +31847,7 @@ define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -31691,6 +31930,7 @@ define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -31812,6 +32052,7 @@ define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -31928,6 +32169,7 @@ define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -32047,6 +32289,7 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -32167,6 +32410,7 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -32287,6 +32531,7 @@ define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -32407,6 +32652,7 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -32533,6 +32779,7 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -32654,6 +32901,7 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -32778,6 +33026,7 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -32903,6 +33152,7 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -33028,6 +33278,7 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -33153,6 +33404,7 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -34023,6 +34275,7 @@ define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -34067,6 +34320,7 @@ define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -34120,6 +34374,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -34176,6 +34431,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -34225,6 +34481,7 @@ define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -34280,6 +34537,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -34335,6 +34593,7 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -34385,6 +34644,7 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -34444,6 +34704,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -34506,6 +34767,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -34561,6 +34823,7 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -34622,6 +34885,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -35543,6 +35807,7 @@ define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -35589,6 +35854,7 @@ define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -35654,6 +35920,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -35722,6 +35989,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -35773,6 +36041,7 @@ define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -35840,6 +36109,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -35897,6 +36167,7 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -35949,6 +36220,7 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -36020,6 +36292,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -36094,6 +36367,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -36151,6 +36425,7 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -36224,6 +36499,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -36471,6 +36747,7 @@ define zeroext i32 @test_vpcmpsgeq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -36542,6 +36819,7 @@ define zeroext i32 @test_vpcmpsgeq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -36616,6 +36894,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -36691,6 +36970,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -36766,6 +37046,7 @@ define zeroext i32 @test_vpcmpsgeq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -36841,6 +37122,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -36922,6 +37204,7 @@ define zeroext i64 @test_vpcmpsgeq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -36998,6 +37281,7 @@ define zeroext i64 @test_vpcmpsgeq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -37077,6 +37361,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -37157,6 +37442,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -37237,6 +37523,7 @@ define zeroext i64 @test_vpcmpsgeq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -37317,6 +37604,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -37443,6 +37731,7 @@ define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -37564,6 +37853,7 @@ define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -37688,6 +37978,7 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -37813,6 +38104,7 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -37943,6 +38235,7 @@ define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -38069,6 +38362,7 @@ define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -38198,6 +38492,7 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -38328,6 +38623,7 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -38382,6 +38678,7 @@ define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -38432,6 +38729,7 @@ define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -38493,6 +38791,7 @@ define zeroext i64 @test_masked_vpcmpultb_v32i1_v64i1_mask(i32 zeroext %__u, <4
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -38555,6 +38854,7 @@ define zeroext i64 @test_masked_vpcmpultb_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -38587,8 +38887,6 @@ define zeroext i16 @test_vpcmpultw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -38619,8 +38917,6 @@ define zeroext i16 @test_vpcmpultw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -38654,8 +38950,6 @@ define zeroext i16 @test_masked_vpcmpultw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -38690,8 +38984,6 @@ define zeroext i16 @test_masked_vpcmpultw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; NoVLX-NEXT: vzeroupper
@@ -38774,6 +39066,7 @@ define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -38850,6 +39143,7 @@ define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -38929,6 +39223,7 @@ define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -39009,6 +39304,7 @@ define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -39094,6 +39390,7 @@ define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -39175,6 +39472,7 @@ define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -39259,6 +39557,7 @@ define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -39344,6 +39643,7 @@ define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -39470,6 +39770,7 @@ define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -39592,6 +39893,7 @@ define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -39717,6 +40019,7 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -39843,6 +40146,7 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -39974,6 +40278,7 @@ define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -40101,6 +40406,7 @@ define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -40231,6 +40537,7 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -40362,6 +40669,7 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -40720,6 +41028,7 @@ define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -40989,6 +41298,7 @@ define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -41354,6 +41664,7 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask(i32 zeroext %__u, <8
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -41635,6 +41946,7 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -42537,6 +42849,7 @@ define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -42581,6 +42894,7 @@ define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -42645,6 +42959,7 @@ define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -42711,6 +43026,7 @@ define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -42761,6 +43077,7 @@ define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -42827,6 +43144,7 @@ define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -42883,6 +43201,7 @@ define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -42933,6 +43252,7 @@ define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -43003,6 +43323,7 @@ define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -43075,6 +43396,7 @@ define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -43131,6 +43453,7 @@ define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -43203,6 +43526,7 @@ define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -43476,6 +43800,7 @@ define zeroext i32 @test_vpcmpultd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -43549,6 +43874,7 @@ define zeroext i32 @test_vpcmpultd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -43625,6 +43951,7 @@ define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -43702,6 +44029,7 @@ define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -43779,6 +44107,7 @@ define zeroext i32 @test_vpcmpultd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -43856,6 +44185,7 @@ define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -43939,6 +44269,7 @@ define zeroext i64 @test_vpcmpultd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -44017,6 +44348,7 @@ define zeroext i64 @test_vpcmpultd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -44098,6 +44430,7 @@ define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -44180,6 +44513,7 @@ define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -44262,6 +44596,7 @@ define zeroext i64 @test_vpcmpultd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -44344,6 +44679,7 @@ define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -44465,6 +44801,7 @@ define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -44581,6 +44918,7 @@ define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -44700,6 +45038,7 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -44820,6 +45159,7 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -44940,6 +45280,7 @@ define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -45060,6 +45401,7 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -45186,6 +45528,7 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -45307,6 +45650,7 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -45431,6 +45775,7 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -45556,6 +45901,7 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -45681,6 +46027,7 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -45806,6 +46153,7 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -46707,6 +47055,7 @@ define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -46751,6 +47100,7 @@ define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -46807,6 +47157,7 @@ define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -46865,6 +47216,7 @@ define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -46915,6 +47267,7 @@ define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -46973,6 +47326,7 @@ define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -47029,6 +47383,7 @@ define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -47079,6 +47434,7 @@ define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -47141,6 +47497,7 @@ define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -47205,6 +47562,7 @@ define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -47261,6 +47619,7 @@ define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -47325,6 +47684,7 @@ define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -48255,6 +48615,7 @@ define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -48301,6 +48662,7 @@ define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -48367,6 +48729,7 @@ define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -48435,6 +48798,7 @@ define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -48487,6 +48851,7 @@ define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -48555,6 +48920,7 @@ define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -48613,6 +48979,7 @@ define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -48665,6 +49032,7 @@ define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -48737,6 +49105,7 @@ define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -48811,6 +49180,7 @@ define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -48869,6 +49239,7 @@ define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -48943,6 +49314,7 @@ define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -49190,6 +49562,7 @@ define zeroext i32 @test_vpcmpultq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -49261,6 +49634,7 @@ define zeroext i32 @test_vpcmpultq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -49335,6 +49709,7 @@ define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -49410,6 +49785,7 @@ define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -49485,6 +49861,7 @@ define zeroext i32 @test_vpcmpultq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -49560,6 +49937,7 @@ define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -49641,6 +50019,7 @@ define zeroext i64 @test_vpcmpultq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -49717,6 +50096,7 @@ define zeroext i64 @test_vpcmpultq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -49796,6 +50176,7 @@ define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -49876,6 +50257,7 @@ define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -49956,6 +50338,7 @@ define zeroext i64 @test_vpcmpultq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -50036,6 +50419,7 @@ define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -50829,6 +51213,7 @@ define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -50870,6 +51255,7 @@ define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -50913,6 +51299,7 @@ define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, float*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -50964,6 +51351,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask(i4 zeroext %__u, <2 x
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -51015,6 +51403,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask_mem(i4 zeroext %__u, <
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -51068,6 +51457,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask_mem_b(i4 zeroext %__u,
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -51121,6 +51511,7 @@ define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -51168,6 +51559,7 @@ define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -51217,6 +51609,7 @@ define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, float*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -51274,6 +51667,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask(i4 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -51331,6 +51725,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem(i4 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -51390,6 +51785,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem_b(i4 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -51663,6 +52059,7 @@ define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -51736,6 +52133,7 @@ define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -51810,6 +52208,7 @@ define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, float*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -51887,6 +52286,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v8i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -51964,6 +52364,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -52042,6 +52443,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -52126,6 +52528,7 @@ define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -52204,6 +52607,7 @@ define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -52283,6 +52687,7 @@ define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, float*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -52365,6 +52770,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v8i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -52447,6 +52853,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -52530,6 +52937,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -52652,6 +53060,7 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -52768,6 +53177,7 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -52885,6 +53295,7 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, float*
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -53005,6 +53416,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -53125,6 +53537,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -53246,6 +53659,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -53414,6 +53828,7 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -53535,6 +53950,7 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -53657,6 +54073,7 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, float*
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -53782,6 +54199,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -53907,6 +54325,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -54033,6 +54452,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: popq %r14
; NoVLX-NEXT: popq %r15
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -54886,6 +55306,7 @@ define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -54927,6 +55348,7 @@ define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -54970,6 +55392,7 @@ define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, double*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -55020,6 +55443,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask(i2 zeroext %__u, <2 x
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -55070,6 +55494,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem(i2 zeroext %__u, <
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -55122,6 +55547,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem_b(i2 zeroext %__u,
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -55175,6 +55601,7 @@ define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -55222,6 +55649,7 @@ define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -55271,6 +55699,7 @@ define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, double*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -55327,6 +55756,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask(i2 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -55383,6 +55813,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem(i2 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -55441,6 +55872,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem_b(i2 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -56260,6 +56692,7 @@ define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -56303,6 +56736,7 @@ define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -56348,6 +56782,7 @@ define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, double*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -56401,6 +56836,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask(i4 zeroext %__u, <4 x
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -56454,6 +56890,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem(i4 zeroext %__u, <
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -56509,6 +56946,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem_b(i4 zeroext %__u,
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -56564,6 +57002,7 @@ define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -56613,6 +57052,7 @@ define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -56664,6 +57104,7 @@ define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, double*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -56723,6 +57164,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask(i4 zeroext %__u, <4 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -56782,6 +57224,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem(i4 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -56843,6 +57286,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem_b(i4 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -57146,6 +57590,7 @@ define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -57217,6 +57662,7 @@ define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -57289,6 +57735,7 @@ define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, double*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -57364,6 +57811,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v8i1_v32i1_mask(i8 zeroext %__u, <8 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -57439,6 +57887,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -57515,6 +57964,7 @@ define zeroext i32 @test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -57647,6 +58097,7 @@ define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -57723,6 +58174,7 @@ define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -57800,6 +58252,7 @@ define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, double*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -57880,6 +58333,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v8i1_v64i1_mask(i8 zeroext %__u, <8 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -57960,6 +58414,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
@@ -58041,6 +58496,7 @@ define zeroext i64 @test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: .cfi_def_cfa %rsp, 8
; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
diff --git a/test/CodeGen/X86/avx512vl-vec-test-testn.ll b/test/CodeGen/X86/avx512vl-vec-test-testn.ll
index f1919cb118c..32de0254efa 100644
--- a/test/CodeGen/X86/avx512vl-vec-test-testn.ll
+++ b/test/CodeGen/X86/avx512vl-vec-test-testn.ll
@@ -6,18 +6,14 @@
define zeroext i8 @TEST_mm_test_epi64_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm_test_epi64_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X86_64-NEXT: vpcmpneqq %xmm1, %xmm0, %k0
+; X86_64-NEXT: vptestmq %xmm0, %xmm1, %k0
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: retq
;
; I386-LABEL: TEST_mm_test_epi64_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %xmm0, %xmm1, %xmm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; I386-NEXT: vpcmpneqq %xmm1, %xmm0, %k0
+; I386-NEXT: vptestmq %xmm0, %xmm1, %k0
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: retl
@@ -33,18 +29,14 @@ entry:
define zeroext i8 @TEST_mm_test_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm_test_epi32_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X86_64-NEXT: vpcmpneqd %xmm1, %xmm0, %k0
+; X86_64-NEXT: vptestmd %xmm0, %xmm1, %k0
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: retq
;
; I386-LABEL: TEST_mm_test_epi32_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %xmm0, %xmm1, %xmm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; I386-NEXT: vpcmpneqd %xmm1, %xmm0, %k0
+; I386-NEXT: vptestmd %xmm0, %xmm1, %k0
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: retl
@@ -61,9 +53,7 @@ entry:
define zeroext i8 @TEST_mm256_test_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm256_test_epi64_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %ymm0, %ymm1, %ymm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X86_64-NEXT: vpcmpneqq %ymm1, %ymm0, %k0
+; X86_64-NEXT: vptestmq %ymm0, %ymm1, %k0
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: vzeroupper
@@ -71,9 +61,7 @@ define zeroext i8 @TEST_mm256_test_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) lo
;
; I386-LABEL: TEST_mm256_test_epi64_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %ymm0, %ymm1, %ymm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; I386-NEXT: vpcmpneqq %ymm1, %ymm0, %k0
+; I386-NEXT: vptestmq %ymm0, %ymm1, %k0
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: vzeroupper
@@ -90,9 +78,7 @@ entry:
define zeroext i8 @TEST_mm256_test_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm256_test_epi32_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %ymm0, %ymm1, %ymm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X86_64-NEXT: vpcmpneqd %ymm1, %ymm0, %k0
+; X86_64-NEXT: vptestmd %ymm0, %ymm1, %k0
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: vzeroupper
@@ -100,9 +86,7 @@ define zeroext i8 @TEST_mm256_test_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) lo
;
; I386-LABEL: TEST_mm256_test_epi32_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %ymm0, %ymm1, %ymm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; I386-NEXT: vpcmpneqd %ymm1, %ymm0, %k0
+; I386-NEXT: vptestmd %ymm0, %ymm1, %k0
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: vzeroupper
@@ -119,21 +103,17 @@ entry:
define zeroext i8 @TEST_mm_mask_test_epi64_mask(i8 %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm_mask_test_epi64_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X86_64-NEXT: kmovw %edi, %k1
-; X86_64-NEXT: vpcmpneqq %xmm1, %xmm0, %k0 {%k1}
+; X86_64-NEXT: vptestmq %xmm0, %xmm1, %k0 {%k1}
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: retq
;
; I386-LABEL: TEST_mm_mask_test_epi64_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %xmm0, %xmm1, %xmm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; I386-NEXT: kmovw %eax, %k1
-; I386-NEXT: vpcmpneqq %xmm1, %xmm0, %k0 {%k1}
+; I386-NEXT: vptestmq %xmm0, %xmm1, %k0 {%k1}
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: retl
@@ -152,21 +132,17 @@ entry:
define zeroext i8 @TEST_mm_mask_test_epi32_mask(i8 %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm_mask_test_epi32_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X86_64-NEXT: kmovw %edi, %k1
-; X86_64-NEXT: vpcmpneqd %xmm1, %xmm0, %k0 {%k1}
+; X86_64-NEXT: vptestmd %xmm0, %xmm1, %k0 {%k1}
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: retq
;
; I386-LABEL: TEST_mm_mask_test_epi32_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %xmm0, %xmm1, %xmm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; I386-NEXT: kmovw %eax, %k1
-; I386-NEXT: vpcmpneqd %xmm1, %xmm0, %k0 {%k1}
+; I386-NEXT: vptestmd %xmm0, %xmm1, %k0 {%k1}
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: retl
@@ -187,10 +163,8 @@ entry:
define zeroext i8 @TEST_mm256_mask_test_epi64_mask(i8 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm256_mask_test_epi64_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %ymm0, %ymm1, %ymm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X86_64-NEXT: kmovw %edi, %k1
-; X86_64-NEXT: vpcmpneqq %ymm1, %ymm0, %k0 {%k1}
+; X86_64-NEXT: vptestmq %ymm0, %ymm1, %k0 {%k1}
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: vzeroupper
@@ -198,11 +172,9 @@ define zeroext i8 @TEST_mm256_mask_test_epi64_mask(i8 %__U, <4 x i64> %__A, <4 x
;
; I386-LABEL: TEST_mm256_mask_test_epi64_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %ymm0, %ymm1, %ymm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; I386-NEXT: kmovw %eax, %k1
-; I386-NEXT: vpcmpneqq %ymm1, %ymm0, %k0 {%k1}
+; I386-NEXT: vptestmq %ymm0, %ymm1, %k0 {%k1}
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: vzeroupper
@@ -222,10 +194,8 @@ entry:
define zeroext i8 @TEST_mm256_mask_test_epi32_mask(i8 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm256_mask_test_epi32_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %ymm0, %ymm1, %ymm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X86_64-NEXT: kmovw %edi, %k1
-; X86_64-NEXT: vpcmpneqd %ymm1, %ymm0, %k0 {%k1}
+; X86_64-NEXT: vptestmd %ymm0, %ymm1, %k0 {%k1}
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: vzeroupper
@@ -233,11 +203,9 @@ define zeroext i8 @TEST_mm256_mask_test_epi32_mask(i8 %__U, <4 x i64> %__A, <4 x
;
; I386-LABEL: TEST_mm256_mask_test_epi32_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %ymm0, %ymm1, %ymm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; I386-NEXT: kmovw %eax, %k1
-; I386-NEXT: vpcmpneqd %ymm1, %ymm0, %k0 {%k1}
+; I386-NEXT: vptestmd %ymm0, %ymm1, %k0 {%k1}
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: vzeroupper
@@ -256,18 +224,14 @@ entry:
define zeroext i8 @TEST_mm_testn_epi64_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm_testn_epi64_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X86_64-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
+; X86_64-NEXT: vptestnmq %xmm0, %xmm1, %k0
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: retq
;
; I386-LABEL: TEST_mm_testn_epi64_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %xmm0, %xmm1, %xmm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; I386-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
+; I386-NEXT: vptestnmq %xmm0, %xmm1, %k0
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: retl
@@ -283,18 +247,14 @@ entry:
define zeroext i8 @TEST_mm_testn_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm_testn_epi32_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X86_64-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
+; X86_64-NEXT: vptestnmd %xmm0, %xmm1, %k0
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: retq
;
; I386-LABEL: TEST_mm_testn_epi32_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %xmm0, %xmm1, %xmm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; I386-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
+; I386-NEXT: vptestnmd %xmm0, %xmm1, %k0
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: retl
@@ -311,9 +271,7 @@ entry:
define zeroext i8 @TEST_mm256_testn_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm256_testn_epi64_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %ymm0, %ymm1, %ymm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X86_64-NEXT: vpcmpeqq %ymm1, %ymm0, %k0
+; X86_64-NEXT: vptestnmq %ymm0, %ymm1, %k0
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: vzeroupper
@@ -321,9 +279,7 @@ define zeroext i8 @TEST_mm256_testn_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) l
;
; I386-LABEL: TEST_mm256_testn_epi64_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %ymm0, %ymm1, %ymm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; I386-NEXT: vpcmpeqq %ymm1, %ymm0, %k0
+; I386-NEXT: vptestnmq %ymm0, %ymm1, %k0
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: vzeroupper
@@ -340,9 +296,7 @@ entry:
define zeroext i8 @TEST_mm256_testn_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm256_testn_epi32_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %ymm0, %ymm1, %ymm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X86_64-NEXT: vpcmpeqd %ymm1, %ymm0, %k0
+; X86_64-NEXT: vptestnmd %ymm0, %ymm1, %k0
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: vzeroupper
@@ -350,9 +304,7 @@ define zeroext i8 @TEST_mm256_testn_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) l
;
; I386-LABEL: TEST_mm256_testn_epi32_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %ymm0, %ymm1, %ymm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; I386-NEXT: vpcmpeqd %ymm1, %ymm0, %k0
+; I386-NEXT: vptestnmd %ymm0, %ymm1, %k0
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: vzeroupper
@@ -369,21 +321,17 @@ entry:
define zeroext i8 @TEST_mm_mask_testn_epi64_mask(i8 %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm_mask_testn_epi64_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X86_64-NEXT: kmovw %edi, %k1
-; X86_64-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
+; X86_64-NEXT: vptestnmq %xmm0, %xmm1, %k0 {%k1}
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: retq
;
; I386-LABEL: TEST_mm_mask_testn_epi64_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %xmm0, %xmm1, %xmm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; I386-NEXT: kmovw %eax, %k1
-; I386-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
+; I386-NEXT: vptestnmq %xmm0, %xmm1, %k0 {%k1}
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: retl
@@ -402,21 +350,17 @@ entry:
define zeroext i8 @TEST_mm_mask_testn_epi32_mask(i8 %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm_mask_testn_epi32_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X86_64-NEXT: kmovw %edi, %k1
-; X86_64-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1}
+; X86_64-NEXT: vptestnmd %xmm0, %xmm1, %k0 {%k1}
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: retq
;
; I386-LABEL: TEST_mm_mask_testn_epi32_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %xmm0, %xmm1, %xmm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; I386-NEXT: kmovw %eax, %k1
-; I386-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1}
+; I386-NEXT: vptestnmd %xmm0, %xmm1, %k0 {%k1}
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: retl
@@ -437,10 +381,8 @@ entry:
define zeroext i8 @TEST_mm256_mask_testn_epi64_mask(i8 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm256_mask_testn_epi64_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %ymm0, %ymm1, %ymm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X86_64-NEXT: kmovw %edi, %k1
-; X86_64-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
+; X86_64-NEXT: vptestnmq %ymm0, %ymm1, %k0 {%k1}
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: vzeroupper
@@ -448,11 +390,9 @@ define zeroext i8 @TEST_mm256_mask_testn_epi64_mask(i8 %__U, <4 x i64> %__A, <4
;
; I386-LABEL: TEST_mm256_mask_testn_epi64_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %ymm0, %ymm1, %ymm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; I386-NEXT: kmovw %eax, %k1
-; I386-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
+; I386-NEXT: vptestnmq %ymm0, %ymm1, %k0 {%k1}
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: vzeroupper
@@ -472,10 +412,8 @@ entry:
define zeroext i8 @TEST_mm256_mask_testn_epi32_mask(i8 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
; X86_64-LABEL: TEST_mm256_mask_testn_epi32_mask:
; X86_64: # BB#0: # %entry
-; X86_64-NEXT: vpand %ymm0, %ymm1, %ymm0
-; X86_64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X86_64-NEXT: kmovw %edi, %k1
-; X86_64-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1}
+; X86_64-NEXT: vptestnmd %ymm0, %ymm1, %k0 {%k1}
; X86_64-NEXT: kmovw %k0, %eax
; X86_64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; X86_64-NEXT: vzeroupper
@@ -483,11 +421,9 @@ define zeroext i8 @TEST_mm256_mask_testn_epi32_mask(i8 %__U, <4 x i64> %__A, <4
;
; I386-LABEL: TEST_mm256_mask_testn_epi32_mask:
; I386: # BB#0: # %entry
-; I386-NEXT: vpand %ymm0, %ymm1, %ymm0
-; I386-NEXT: vpxor %xmm1, %xmm1, %xmm1
; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; I386-NEXT: kmovw %eax, %k1
-; I386-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1}
+; I386-NEXT: vptestnmd %ymm0, %ymm1, %k0 {%k1}
; I386-NEXT: kmovw %k0, %eax
; I386-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
; I386-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/avx512vlcd-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512vlcd-intrinsics-fast-isel.ll
new file mode 100644
index 00000000000..ab4cbeb8d5e
--- /dev/null
+++ b/test/CodeGen/X86/avx512vlcd-intrinsics-fast-isel.ll
@@ -0,0 +1,75 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512vl | FileCheck %s
+
+define <2 x i64> @test_mm_broadcastmb_epi64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_mm_broadcastmb_epi64:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
+; CHECK-NEXT: vpbroadcastmb2q %k0, %xmm0
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %a to <4 x i32>
+ %1 = bitcast <2 x i64> %b to <4 x i32>
+ %2 = icmp eq <4 x i32> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %4 = bitcast <8 x i1> %3 to i8
+ %conv.i = zext i8 %4 to i64
+ %vecinit.i.i = insertelement <2 x i64> undef, i64 %conv.i, i32 0
+ %vecinit1.i.i = shufflevector <2 x i64> %vecinit.i.i, <2 x i64> undef, <2 x i32> zeroinitializer
+ ret <2 x i64> %vecinit1.i.i
+}
+
+define <4 x i64> @test_mm256_broadcastmb_epi64(<4 x i64> %a, <4 x i64> %b) {
+; CHECK-LABEL: test_mm256_broadcastmb_epi64:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0
+; CHECK-NEXT: vpbroadcastmb2q %k0, %ymm0
+; CHECK-NEXT: retq
+entry:
+ %0 = icmp eq <4 x i64> %a, %b
+ %1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %2 = bitcast <8 x i1> %1 to i8
+ %conv.i = zext i8 %2 to i64
+ %vecinit.i.i = insertelement <4 x i64> undef, i64 %conv.i, i32 0
+ %vecinit3.i.i = shufflevector <4 x i64> %vecinit.i.i, <4 x i64> undef, <4 x i32> zeroinitializer
+ ret <4 x i64> %vecinit3.i.i
+}
+
+define <2 x i64> @test_mm_broadcastmw_epi32(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test_mm_broadcastmw_epi32:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; CHECK-NEXT: vpbroadcastmw2d %k0, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %a to <16 x i32>
+ %1 = bitcast <8 x i64> %b to <16 x i32>
+ %2 = icmp eq <16 x i32> %0, %1
+ %3 = bitcast <16 x i1> %2 to i16
+ %conv.i = zext i16 %3 to i32
+ %vecinit.i.i = insertelement <4 x i32> undef, i32 %conv.i, i32 0
+ %vecinit3.i.i = shufflevector <4 x i32> %vecinit.i.i, <4 x i32> undef, <4 x i32> zeroinitializer
+ %4 = bitcast <4 x i32> %vecinit3.i.i to <2 x i64>
+ ret <2 x i64> %4
+}
+
+define <4 x i64> @test_mm256_broadcastmw_epi32(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test_mm256_broadcastmw_epi32:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; CHECK-NEXT: vpbroadcastmw2d %k0, %ymm0
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %a to <16 x i32>
+ %1 = bitcast <8 x i64> %b to <16 x i32>
+ %2 = icmp eq <16 x i32> %0, %1
+ %3 = bitcast <16 x i1> %2 to i16
+ %conv.i = zext i16 %3 to i32
+ %vecinit.i.i = insertelement <8 x i32> undef, i32 %conv.i, i32 0
+ %vecinit7.i.i = shufflevector <8 x i32> %vecinit.i.i, <8 x i32> undef, <8 x i32> zeroinitializer
+ %4 = bitcast <8 x i32> %vecinit7.i.i to <4 x i64>
+ ret <4 x i64> %4
+}
+
+
diff --git a/test/CodeGen/X86/bitcast-and-setcc-256.ll b/test/CodeGen/X86/bitcast-and-setcc-256.ll
index e197713c679..c48222000c6 100644
--- a/test/CodeGen/X86/bitcast-and-setcc-256.ll
+++ b/test/CodeGen/X86/bitcast-and-setcc-256.ll
@@ -439,6 +439,7 @@ define i32 @v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) {
; AVX512F-NEXT: movl (%rsp), %eax
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: .cfi_def_cfa %rsp, 8
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
diff --git a/test/CodeGen/X86/bitcast-and-setcc-512.ll b/test/CodeGen/X86/bitcast-and-setcc-512.ll
index f6cfbbb4044..f5fe395eaf3 100644
--- a/test/CodeGen/X86/bitcast-and-setcc-512.ll
+++ b/test/CodeGen/X86/bitcast-and-setcc-512.ll
@@ -594,6 +594,7 @@ define i32 @v32i16(<32 x i16> %a, <32 x i16> %b, <32 x i16> %c, <32 x i16> %d) {
; AVX512F-NEXT: movl (%rsp), %eax
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: .cfi_def_cfa %rsp, 8
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -1239,6 +1240,7 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
; AVX1-NEXT: orq %rcx, %rax
; AVX1-NEXT: movq %rbp, %rsp
; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: .cfi_def_cfa %rsp, 8
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -1457,6 +1459,7 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
; AVX2-NEXT: orq %rcx, %rax
; AVX2-NEXT: movq %rbp, %rsp
; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: .cfi_def_cfa %rsp, 8
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -1499,6 +1502,7 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
; AVX512F-NEXT: orq %rcx, %rax
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: .cfi_def_cfa %rsp, 8
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
index 4ed55ac0919..1959000b859 100644
--- a/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
+++ b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
@@ -321,11 +321,17 @@ define <16 x i8> @ext_i16_16i8(i16 %a0) {
; AVX512-NEXT: vpinsrb $15, %r9d, %xmm0, %xmm0
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: .cfi_def_cfa_offset 48
; AVX512-NEXT: popq %r12
+; AVX512-NEXT: .cfi_def_cfa_offset 40
; AVX512-NEXT: popq %r13
+; AVX512-NEXT: .cfi_def_cfa_offset 32
; AVX512-NEXT: popq %r14
+; AVX512-NEXT: .cfi_def_cfa_offset 24
; AVX512-NEXT: popq %r15
+; AVX512-NEXT: .cfi_def_cfa_offset 16
; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: .cfi_def_cfa_offset 8
; AVX512-NEXT: retq
%1 = bitcast i16 %a0 to <16 x i1>
%2 = zext <16 x i1> %1 to <16 x i8>
diff --git a/test/CodeGen/X86/bitcast-setcc-256.ll b/test/CodeGen/X86/bitcast-setcc-256.ll
index ee2dac1d466..76160517546 100644
--- a/test/CodeGen/X86/bitcast-setcc-256.ll
+++ b/test/CodeGen/X86/bitcast-setcc-256.ll
@@ -204,6 +204,7 @@ define i32 @v32i8(<32 x i8> %a, <32 x i8> %b) {
; AVX512F-NEXT: movl (%rsp), %eax
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: .cfi_def_cfa %rsp, 8
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
diff --git a/test/CodeGen/X86/bitcast-setcc-512.ll b/test/CodeGen/X86/bitcast-setcc-512.ll
index 2b73c6e16bd..ef981080bb3 100644
--- a/test/CodeGen/X86/bitcast-setcc-512.ll
+++ b/test/CodeGen/X86/bitcast-setcc-512.ll
@@ -203,6 +203,7 @@ define i32 @v32i16(<32 x i16> %a, <32 x i16> %b) {
; AVX512F-NEXT: movl (%rsp), %eax
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: .cfi_def_cfa %rsp, 8
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -769,6 +770,7 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b) {
; AVX1-NEXT: orq %rcx, %rax
; AVX1-NEXT: movq %rbp, %rsp
; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: .cfi_def_cfa %rsp, 8
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -983,6 +985,7 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b) {
; AVX2-NEXT: orq %rcx, %rax
; AVX2-NEXT: movq %rbp, %rsp
; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: .cfi_def_cfa %rsp, 8
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -1021,6 +1024,7 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b) {
; AVX512F-NEXT: orq %rcx, %rax
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: .cfi_def_cfa %rsp, 8
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
diff --git a/test/CodeGen/X86/bool-vector.ll b/test/CodeGen/X86/bool-vector.ll
index eb40744c54d..692d992df76 100644
--- a/test/CodeGen/X86/bool-vector.ll
+++ b/test/CodeGen/X86/bool-vector.ll
@@ -93,6 +93,7 @@ define i32 @PR15215_good(<4 x i32> %input) {
; X32-NEXT: leal (%eax,%edx,4), %eax
; X32-NEXT: leal (%eax,%esi,8), %eax
; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X32-SSE2-LABEL: PR15215_good:
@@ -115,6 +116,7 @@ define i32 @PR15215_good(<4 x i32> %input) {
; X32-SSE2-NEXT: leal (%eax,%edx,4), %eax
; X32-SSE2-NEXT: leal (%eax,%esi,8), %eax
; X32-SSE2-NEXT: popl %esi
+; X32-SSE2-NEXT: .cfi_def_cfa_offset 4
; X32-SSE2-NEXT: retl
;
; X32-AVX2-LABEL: PR15215_good:
@@ -134,6 +136,7 @@ define i32 @PR15215_good(<4 x i32> %input) {
; X32-AVX2-NEXT: leal (%eax,%edx,4), %eax
; X32-AVX2-NEXT: leal (%eax,%esi,8), %eax
; X32-AVX2-NEXT: popl %esi
+; X32-AVX2-NEXT: .cfi_def_cfa_offset 4
; X32-AVX2-NEXT: retl
;
; X64-LABEL: PR15215_good:
diff --git a/test/CodeGen/X86/broadcastm-lowering.ll b/test/CodeGen/X86/broadcastm-lowering.ll
index 2a8236cf093..fc7b192c2f8 100644
--- a/test/CodeGen/X86/broadcastm-lowering.ll
+++ b/test/CodeGen/X86/broadcastm-lowering.ll
@@ -80,8 +80,7 @@ define <16 x i32> @test_mm512_epi32(<16 x i32> %a, <16 x i32> %b) {
; AVX512CD-LABEL: test_mm512_epi32:
; AVX512CD: # BB#0: # %entry
; AVX512CD-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; AVX512CD-NEXT: kmovw %k0, %eax
-; AVX512CD-NEXT: vpbroadcastd %eax, %zmm0
+; AVX512CD-NEXT: vpbroadcastmw2d %k0, %zmm0
; AVX512CD-NEXT: retq
;
; AVX512VLCDBW-LABEL: test_mm512_epi32:
@@ -110,9 +109,7 @@ define <8 x i64> @test_mm512_epi64(<8 x i32> %a, <8 x i32> %b) {
; AVX512CD-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
; AVX512CD-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512CD-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; AVX512CD-NEXT: kmovw %k0, %eax
-; AVX512CD-NEXT: movzbl %al, %eax
-; AVX512CD-NEXT: vpbroadcastq %rax, %zmm0
+; AVX512CD-NEXT: vpbroadcastmb2q %k0, %zmm0
; AVX512CD-NEXT: retq
;
; AVX512VLCDBW-LABEL: test_mm512_epi64:
diff --git a/test/CodeGen/X86/cmp.ll b/test/CodeGen/X86/cmp.ll
index 82e133d2576..6f9abae6a71 100644
--- a/test/CodeGen/X86/cmp.ll
+++ b/test/CodeGen/X86/cmp.ll
@@ -247,10 +247,13 @@ define i32 @test12() ssp uwtable {
; CHECK-NEXT: # BB#1: # %T
; CHECK-NEXT: movl $1, %eax # encoding: [0xb8,0x01,0x00,0x00,0x00]
; CHECK-NEXT: popq %rcx # encoding: [0x59]
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq # encoding: [0xc3]
; CHECK-NEXT: .LBB12_2: # %F
+; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movl $2, %eax # encoding: [0xb8,0x02,0x00,0x00,0x00]
; CHECK-NEXT: popq %rcx # encoding: [0x59]
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq # encoding: [0xc3]
entry:
%tmp1 = call zeroext i1 @test12b()
diff --git a/test/CodeGen/X86/combine-srl.ll b/test/CodeGen/X86/combine-srl.ll
index 9f7f8a97dc2..c5f03dbd5a3 100644
--- a/test/CodeGen/X86/combine-srl.ll
+++ b/test/CodeGen/X86/combine-srl.ll
@@ -175,7 +175,7 @@ define <4 x i32> @combine_vec_lshr_trunc_lshr0(<4 x i64> %x) {
; SSE: # BB#0:
; SSE-NEXT: psrlq $48, %xmm1
; SSE-NEXT: psrlq $48, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: packusdw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_trunc_lshr0:
diff --git a/test/CodeGen/X86/compress_expand.ll b/test/CodeGen/X86/compress_expand.ll
index c6a1c07922e..9237544ea95 100644
--- a/test/CodeGen/X86/compress_expand.ll
+++ b/test/CodeGen/X86/compress_expand.ll
@@ -140,9 +140,7 @@ define void @test7(float* %base, <8 x float> %V, <8 x i1> %mask) {
; KNL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpmovsxwq %xmm1, %zmm1
; KNL-NEXT: vpsllq $63, %zmm1, %zmm1
-; KNL-NEXT: vptestmq %zmm1, %zmm1, %k0
-; KNL-NEXT: kshiftlw $8, %k0, %k0
-; KNL-NEXT: kshiftrw $8, %k0, %k1
+; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL-NEXT: vcompressps %zmm0, (%rdi) {%k1}
; KNL-NEXT: retq
call void @llvm.masked.compressstore.v8f32(<8 x float> %V, float* %base, <8 x i1> %mask)
diff --git a/test/CodeGen/X86/emutls-pie.ll b/test/CodeGen/X86/emutls-pie.ll
index 3c312a92669..f4561fcbd35 100644
--- a/test/CodeGen/X86/emutls-pie.ll
+++ b/test/CodeGen/X86/emutls-pie.ll
@@ -18,13 +18,16 @@ define i32 @my_get_xyz() {
; X32-NEXT: calll my_emutls_get_address@PLT
; X32-NEXT: movl (%eax), %eax
; X32-NEXT: addl $8, %esp
+; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: popl %ebx
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
; X64-LABEL: my_get_xyz:
; X64: movq my_emutls_v_xyz@GOTPCREL(%rip), %rdi
; X64-NEXT: callq my_emutls_get_address@PLT
; X64-NEXT: movl (%rax), %eax
; X64-NEXT: popq %rcx
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
entry:
@@ -44,13 +47,16 @@ define i32 @f1() {
; X32-NEXT: calll __emutls_get_address@PLT
; X32-NEXT: movl (%eax), %eax
; X32-NEXT: addl $8, %esp
+; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: popl %ebx
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
; X64-LABEL: f1:
; X64: leaq __emutls_v.i(%rip), %rdi
; X64-NEXT: callq __emutls_get_address@PLT
; X64-NEXT: movl (%rax), %eax
; X64-NEXT: popq %rcx
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
entry:
diff --git a/test/CodeGen/X86/emutls.ll b/test/CodeGen/X86/emutls.ll
index 8c0ba903659..2321cd2fc28 100644
--- a/test/CodeGen/X86/emutls.ll
+++ b/test/CodeGen/X86/emutls.ll
@@ -16,12 +16,14 @@ define i32 @my_get_xyz() {
; X32-NEXT: calll my_emutls_get_address
; X32-NEXT: movl (%eax), %eax
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
; X64-LABEL: my_get_xyz:
; X64: movl $my_emutls_v_xyz, %edi
; X64-NEXT: callq my_emutls_get_address
; X64-NEXT: movl (%rax), %eax
; X64-NEXT: popq %rcx
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
entry:
@@ -45,12 +47,14 @@ define i32 @f1() {
; X32-NEXT: calll __emutls_get_address
; X32-NEXT: movl (%eax), %eax
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
; X64-LABEL: f1:
; X64: movl $__emutls_v.i1, %edi
; X64-NEXT: callq __emutls_get_address
; X64-NEXT: movl (%rax), %eax
; X64-NEXT: popq %rcx
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
entry:
@@ -63,11 +67,13 @@ define i32* @f2() {
; X32: movl $__emutls_v.i1, (%esp)
; X32-NEXT: calll __emutls_get_address
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
; X64-LABEL: f2:
; X64: movl $__emutls_v.i1, %edi
; X64-NEXT: callq __emutls_get_address
; X64-NEXT: popq %rcx
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
entry:
@@ -92,6 +98,7 @@ define i32* @f4() {
; X32: movl $__emutls_v.i2, (%esp)
; X32-NEXT: calll __emutls_get_address
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
entry:
@@ -116,6 +123,7 @@ define i32* @f6() {
; X32: movl $__emutls_v.i3, (%esp)
; X32-NEXT: calll __emutls_get_address
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
entry:
@@ -128,6 +136,7 @@ define i32 @f7() {
; X32-NEXT: calll __emutls_get_address
; X32-NEXT: movl (%eax), %eax
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
entry:
@@ -140,6 +149,7 @@ define i32* @f8() {
; X32: movl $__emutls_v.i4, (%esp)
; X32-NEXT: calll __emutls_get_address
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
entry:
@@ -152,6 +162,7 @@ define i32 @f9() {
; X32-NEXT: calll __emutls_get_address
; X32-NEXT: movl (%eax), %eax
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
entry:
@@ -164,6 +175,7 @@ define i32* @f10() {
; X32: movl $__emutls_v.i5, (%esp)
; X32-NEXT: calll __emutls_get_address
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
entry:
@@ -176,6 +188,7 @@ define i16 @f11() {
; X32-NEXT: calll __emutls_get_address
; X32-NEXT: movzwl (%eax), %eax
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
entry:
@@ -189,6 +202,7 @@ define i32 @f12() {
; X32-NEXT: calll __emutls_get_address
; X32-NEXT: movswl (%eax), %eax
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
entry:
@@ -203,6 +217,7 @@ define i8 @f13() {
; X32-NEXT: calll __emutls_get_address
; X32-NEXT: movb (%eax), %al
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
entry:
@@ -216,6 +231,7 @@ define i32 @f14() {
; X32-NEXT: calll __emutls_get_address
; X32-NEXT: movsbl (%eax), %eax
; X32-NEXT: addl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
entry:
diff --git a/test/CodeGen/X86/epilogue-cfi-fp.ll b/test/CodeGen/X86/epilogue-cfi-fp.ll
new file mode 100644
index 00000000000..c2fe1c7eaac
--- /dev/null
+++ b/test/CodeGen/X86/epilogue-cfi-fp.ll
@@ -0,0 +1,43 @@
+; RUN: llc -O0 %s -o - | FileCheck %s
+
+; ModuleID = 'epilogue-cfi-fp.c'
+source_filename = "epilogue-cfi-fp.c"
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i686-pc-linux"
+
+; Function Attrs: noinline nounwind
+define i32 @foo(i32 %i, i32 %j, i32 %k, i32 %l, i32 %m) #0 {
+
+; CHECK-LABEL: foo:
+; CHECK: popl %ebp
+; CHECK-NEXT: .cfi_def_cfa %esp, 4
+; CHECK-NEXT: retl
+
+entry:
+ %i.addr = alloca i32, align 4
+ %j.addr = alloca i32, align 4
+ %k.addr = alloca i32, align 4
+ %l.addr = alloca i32, align 4
+ %m.addr = alloca i32, align 4
+ store i32 %i, i32* %i.addr, align 4
+ store i32 %j, i32* %j.addr, align 4
+ store i32 %k, i32* %k.addr, align 4
+ store i32 %l, i32* %l.addr, align 4
+ store i32 %m, i32* %m.addr, align 4
+ ret i32 0
+}
+
+attributes #0 = { "no-frame-pointer-elim"="true" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5, !6, !7}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 5.0.0 (http://llvm.org/git/clang.git 3f8116e6a2815b1d5f3491493938d0c63c9f42c9) (http://llvm.org/git/llvm.git 4fde77f8f1a8e4482e69b6a7484bc7d1b99b3c0a)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "epilogue-cfi-fp.c", directory: "epilogue-dwarf/test")
+!2 = !{}
+!3 = !{i32 1, !"NumRegisterParameters", i32 0}
+!4 = !{i32 2, !"Dwarf Version", i32 4}
+!5 = !{i32 2, !"Debug Info Version", i32 3}
+!6 = !{i32 1, !"wchar_size", i32 4}
+!7 = !{i32 7, !"PIC Level", i32 2}
+
diff --git a/test/CodeGen/X86/epilogue-cfi-no-fp.ll b/test/CodeGen/X86/epilogue-cfi-no-fp.ll
new file mode 100644
index 00000000000..79d6f478de8
--- /dev/null
+++ b/test/CodeGen/X86/epilogue-cfi-no-fp.ll
@@ -0,0 +1,46 @@
+; RUN: llc -O0 < %s | FileCheck %s
+
+; ModuleID = 'epilogue-cfi-no-fp.c'
+source_filename = "epilogue-cfi-no-fp.c"
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i686-pc-linux"
+
+; Function Attrs: noinline nounwind
+define i32 @foo(i32 %i, i32 %j, i32 %k, i32 %l, i32 %m) {
+; CHECK-LABEL: foo:
+; CHECK: addl $20, %esp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popl %esi
+; CHECK-NEXT: .cfi_def_cfa_offset 12
+; CHECK-NEXT: popl %edi
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: popl %ebx
+; CHECK-NEXT: .cfi_def_cfa_offset 4
+; CHECK-NEXT: retl
+entry:
+ %i.addr = alloca i32, align 4
+ %j.addr = alloca i32, align 4
+ %k.addr = alloca i32, align 4
+ %l.addr = alloca i32, align 4
+ %m.addr = alloca i32, align 4
+ store i32 %i, i32* %i.addr, align 4
+ store i32 %j, i32* %j.addr, align 4
+ store i32 %k, i32* %k.addr, align 4
+ store i32 %l, i32* %l.addr, align 4
+ store i32 %m, i32* %m.addr, align 4
+ ret i32 0
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5, !6, !7}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 5.0.0 (http://llvm.org/git/clang.git 3f8116e6a2815b1d5f3491493938d0c63c9f42c9) (http://llvm.org/git/llvm.git 4fde77f8f1a8e4482e69b6a7484bc7d1b99b3c0a)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "epilogue-cfi-no-fp.c", directory: "epilogue-dwarf/test")
+!2 = !{}
+!3 = !{i32 1, !"NumRegisterParameters", i32 0}
+!4 = !{i32 2, !"Dwarf Version", i32 4}
+!5 = !{i32 2, !"Debug Info Version", i32 3}
+!6 = !{i32 1, !"wchar_size", i32 4}
+!7 = !{i32 7, !"PIC Level", i32 2}
+
+
diff --git a/test/CodeGen/X86/f16c-intrinsics.ll b/test/CodeGen/X86/f16c-intrinsics.ll
index 712fe810d2a..64f8fd0ca8d 100644
--- a/test/CodeGen/X86/f16c-intrinsics.ll
+++ b/test/CodeGen/X86/f16c-intrinsics.ll
@@ -1,33 +1,81 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx,+f16c | FileCheck %s --check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx,+f16c -show-mc-encoding -disable-peephole | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c -show-mc-encoding -disable-peephole | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl -show-mc-encoding -disable-peephole | FileCheck %s --check-prefix=X32-AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl -show-mc-encoding -disable-peephole | FileCheck %s --check-prefix=X64-AVX512VL
define <4 x float> @test_x86_vcvtph2ps_128(<8 x i16> %a0) {
; X32-LABEL: test_x86_vcvtph2ps_128:
; X32: # BB#0:
-; X32-NEXT: vcvtph2ps %xmm0, %xmm0
-; X32-NEXT: retl
+; X32-NEXT: vcvtph2ps %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0xc0]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtph2ps_128:
; X64: # BB#0:
-; X64-NEXT: vcvtph2ps %xmm0, %xmm0
-; X64-NEXT: retq
+; X64-NEXT: vcvtph2ps %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0xc0]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtph2ps_128:
+; X32-AVX512VL: # BB#0:
+; X32-AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc0]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtph2ps_128:
+; X64-AVX512VL: # BB#0:
+; X64-AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc0]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16>) nounwind readonly
+define <4 x float> @test_x86_vcvtph2ps_128_m(<8 x i16>* nocapture %a) {
+; X32-LABEL: test_x86_vcvtph2ps_128_m:
+; X32: # BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-NEXT: vcvtph2ps (%eax), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x00]
+; X32-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_x86_vcvtph2ps_128_m:
+; X64: # BB#0:
+; X64-NEXT: vcvtph2ps (%rdi), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x07]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtph2ps_128_m:
+; X32-AVX512VL: # BB#0:
+; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-AVX512VL-NEXT: vcvtph2ps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x00]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtph2ps_128_m:
+; X64-AVX512VL: # BB#0:
+; X64-AVX512VL-NEXT: vcvtph2ps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x07]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
+ %load = load <8 x i16>, <8 x i16>* %a
+ %res = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %load) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
define <8 x float> @test_x86_vcvtph2ps_256(<8 x i16> %a0) {
; X32-LABEL: test_x86_vcvtph2ps_256:
; X32: # BB#0:
-; X32-NEXT: vcvtph2ps %xmm0, %ymm0
-; X32-NEXT: retl
+; X32-NEXT: vcvtph2ps %xmm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x13,0xc0]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtph2ps_256:
; X64: # BB#0:
-; X64-NEXT: vcvtph2ps %xmm0, %ymm0
-; X64-NEXT: retq
+; X64-NEXT: vcvtph2ps %xmm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x13,0xc0]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtph2ps_256:
+; X32-AVX512VL: # BB#0:
+; X32-AVX512VL-NEXT: vcvtph2ps %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x13,0xc0]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtph2ps_256:
+; X64-AVX512VL: # BB#0:
+; X64-AVX512VL-NEXT: vcvtph2ps %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x13,0xc0]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %a0) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -36,15 +84,26 @@ declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readonly
define <8 x float> @test_x86_vcvtph2ps_256_m(<8 x i16>* nocapture %a) nounwind {
; X32-LABEL: test_x86_vcvtph2ps_256_m:
; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vcvtph2ps (%eax), %ymm0
-; X32-NEXT: retl
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-NEXT: vcvtph2ps (%eax), %ymm0 # encoding: [0xc4,0xe2,0x7d,0x13,0x00]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtph2ps_256_m:
; X64: # BB#0:
-; X64-NEXT: vcvtph2ps (%rdi), %ymm0
-; X64-NEXT: retq
- %load = load <8 x i16>, <8 x i16>* %a, align 16
+; X64-NEXT: vcvtph2ps (%rdi), %ymm0 # encoding: [0xc4,0xe2,0x7d,0x13,0x07]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtph2ps_256_m:
+; X32-AVX512VL: # BB#0:
+; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-AVX512VL-NEXT: vcvtph2ps (%eax), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x13,0x00]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtph2ps_256_m:
+; X64-AVX512VL: # BB#0:
+; X64-AVX512VL-NEXT: vcvtph2ps (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x13,0x07]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
+ %load = load <8 x i16>, <8 x i16>* %a
%res = tail call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %load)
ret <8 x float> %res
}
@@ -52,13 +111,23 @@ define <8 x float> @test_x86_vcvtph2ps_256_m(<8 x i16>* nocapture %a) nounwind {
define <8 x i16> @test_x86_vcvtps2ph_128(<4 x float> %a0) {
; X32-LABEL: test_x86_vcvtps2ph_128:
; X32: # BB#0:
-; X32-NEXT: vcvtps2ph $0, %xmm0, %xmm0
-; X32-NEXT: retl
+; X32-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x00]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtps2ph_128:
; X64: # BB#0:
-; X64-NEXT: vcvtps2ph $0, %xmm0, %xmm0
-; X64-NEXT: retq
+; X64-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x00]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_128:
+; X32-AVX512VL: # BB#0:
+; X32-AVX512VL-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x00]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_128:
+; X64-AVX512VL: # BB#0:
+; X64-AVX512VL-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x00]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a0, i32 0) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -67,15 +136,27 @@ declare <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float>, i32) nounwind readonly
define <8 x i16> @test_x86_vcvtps2ph_256(<8 x float> %a0) {
; X32-LABEL: test_x86_vcvtps2ph_256:
; X32: # BB#0:
-; X32-NEXT: vcvtps2ph $0, %ymm0, %xmm0
-; X32-NEXT: vzeroupper
-; X32-NEXT: retl
+; X32-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x00]
+; X32-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtps2ph_256:
; X64: # BB#0:
-; X64-NEXT: vcvtps2ph $0, %ymm0, %xmm0
-; X64-NEXT: vzeroupper
-; X64-NEXT: retq
+; X64-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x00]
+; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_256:
+; X32-AVX512VL: # BB#0:
+; X32-AVX512VL-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x00]
+; X32-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_256:
+; X64-AVX512VL: # BB#0:
+; X64-AVX512VL-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x00]
+; X64-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a0, i32 0) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -84,14 +165,25 @@ declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readonly
define <4 x float> @test_x86_vcvtps2ph_128_scalar(i64* %ptr) {
; X32-LABEL: test_x86_vcvtps2ph_128_scalar:
; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vcvtph2ps (%eax), %xmm0
-; X32-NEXT: retl
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-NEXT: vcvtph2ps (%eax), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x00]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtps2ph_128_scalar:
; X64: # BB#0:
-; X64-NEXT: vcvtph2ps (%rdi), %xmm0
-; X64-NEXT: retq
+; X64-NEXT: vcvtph2ps (%rdi), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x07]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_128_scalar:
+; X32-AVX512VL: # BB#0:
+; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-AVX512VL-NEXT: vcvtph2ps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x00]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_128_scalar:
+; X64-AVX512VL: # BB#0:
+; X64-AVX512VL-NEXT: vcvtph2ps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x07]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
%load = load i64, i64* %ptr
%ins1 = insertelement <2 x i64> undef, i64 %load, i32 0
%ins2 = insertelement <2 x i64> %ins1, i64 0, i32 1
@@ -103,14 +195,25 @@ define <4 x float> @test_x86_vcvtps2ph_128_scalar(i64* %ptr) {
define <4 x float> @test_x86_vcvtps2ph_128_scalar2(i64* %ptr) {
; X32-LABEL: test_x86_vcvtps2ph_128_scalar2:
; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vcvtph2ps (%eax), %xmm0
-; X32-NEXT: retl
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-NEXT: vcvtph2ps (%eax), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x00]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtps2ph_128_scalar2:
; X64: # BB#0:
-; X64-NEXT: vcvtph2ps (%rdi), %xmm0
-; X64-NEXT: retq
+; X64-NEXT: vcvtph2ps (%rdi), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x07]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_128_scalar2:
+; X32-AVX512VL: # BB#0:
+; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-AVX512VL-NEXT: vcvtph2ps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x00]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_128_scalar2:
+; X64-AVX512VL: # BB#0:
+; X64-AVX512VL-NEXT: vcvtph2ps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x07]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
%load = load i64, i64* %ptr
%ins = insertelement <2 x i64> undef, i64 %load, i32 0
%bc = bitcast <2 x i64> %ins to <8 x i16>
@@ -121,16 +224,29 @@ define <4 x float> @test_x86_vcvtps2ph_128_scalar2(i64* %ptr) {
define void @test_x86_vcvtps2ph_256_m(<8 x i16>* nocapture %d, <8 x float> %a) nounwind {
; X32-LABEL: test_x86_vcvtps2ph_256_m:
; X32: # BB#0: # %entry
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vcvtps2ph $3, %ymm0, (%eax)
-; X32-NEXT: vzeroupper
-; X32-NEXT: retl
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-NEXT: vcvtps2ph $3, %ymm0, (%eax) # encoding: [0xc4,0xe3,0x7d,0x1d,0x00,0x03]
+; X32-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtps2ph_256_m:
; X64: # BB#0: # %entry
-; X64-NEXT: vcvtps2ph $3, %ymm0, (%rdi)
-; X64-NEXT: vzeroupper
-; X64-NEXT: retq
+; X64-NEXT: vcvtps2ph $3, %ymm0, (%rdi) # encoding: [0xc4,0xe3,0x7d,0x1d,0x07,0x03]
+; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_256_m:
+; X32-AVX512VL: # BB#0: # %entry
+; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-AVX512VL-NEXT: vcvtps2ph $3, %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0x00,0x03]
+; X32-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_256_m:
+; X64-AVX512VL: # BB#0: # %entry
+; X64-AVX512VL-NEXT: vcvtps2ph $3, %ymm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0x07,0x03]
+; X64-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
entry:
%0 = tail call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a, i32 3)
store <8 x i16> %0, <8 x i16>* %d, align 16
@@ -140,14 +256,31 @@ entry:
define void @test_x86_vcvtps2ph_128_m(<4 x i16>* nocapture %d, <4 x float> %a) nounwind {
; X32-LABEL: test_x86_vcvtps2ph_128_m:
; X32: # BB#0: # %entry
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vcvtps2ph $3, %xmm0, (%eax)
-; X32-NEXT: retl
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-NEXT: vcvtps2ph $3, %xmm0, (%eax) # encoding: [0xc4,0xe3,0x79,0x1d,0x00,0x03]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtps2ph_128_m:
; X64: # BB#0: # %entry
-; X64-NEXT: vcvtps2ph $3, %xmm0, (%rdi)
-; X64-NEXT: retq
+; X64-NEXT: vcvtps2ph $3, %xmm0, (%rdi) # encoding: [0xc4,0xe3,0x79,0x1d,0x07,0x03]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m:
+; X32-AVX512VL: # BB#0: # %entry
+; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x03]
+; X32-AVX512VL-NEXT: vpmovzxwd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x33,0xc0]
+; X32-AVX512VL-NEXT: # xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X32-AVX512VL-NEXT: vpmovdw %xmm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x08,0x33,0x00]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m:
+; X64-AVX512VL: # BB#0: # %entry
+; X64-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x03]
+; X64-AVX512VL-NEXT: vpmovzxwd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x33,0xc0]
+; X64-AVX512VL-NEXT: # xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X64-AVX512VL-NEXT: vpmovdw %xmm0, (%rdi) # encoding: [0x62,0xf2,0x7e,0x08,0x33,0x07]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
entry:
%0 = tail call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a, i32 3)
%1 = shufflevector <8 x i16> %0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -158,14 +291,25 @@ entry:
define void @test_x86_vcvtps2ph_128_m2(double* nocapture %hf4x16, <4 x float> %f4x32) #0 {
; X32-LABEL: test_x86_vcvtps2ph_128_m2:
; X32: # BB#0: # %entry
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vcvtps2ph $3, %xmm0, (%eax)
-; X32-NEXT: retl
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-NEXT: vcvtps2ph $3, %xmm0, (%eax) # encoding: [0xc4,0xe3,0x79,0x1d,0x00,0x03]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtps2ph_128_m2:
; X64: # BB#0: # %entry
-; X64-NEXT: vcvtps2ph $3, %xmm0, (%rdi)
-; X64-NEXT: retq
+; X64-NEXT: vcvtps2ph $3, %xmm0, (%rdi) # encoding: [0xc4,0xe3,0x79,0x1d,0x07,0x03]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m2:
+; X32-AVX512VL: # BB#0: # %entry
+; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0x00,0x03]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m2:
+; X64-AVX512VL: # BB#0: # %entry
+; X64-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0x07,0x03]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
entry:
%0 = tail call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %f4x32, i32 3)
%1 = bitcast <8 x i16> %0 to <2 x double>
@@ -177,14 +321,25 @@ entry:
define void @test_x86_vcvtps2ph_128_m3(i64* nocapture %hf4x16, <4 x float> %f4x32) #0 {
; X32-LABEL: test_x86_vcvtps2ph_128_m3:
; X32: # BB#0: # %entry
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vcvtps2ph $3, %xmm0, (%eax)
-; X32-NEXT: retl
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-NEXT: vcvtps2ph $3, %xmm0, (%eax) # encoding: [0xc4,0xe3,0x79,0x1d,0x00,0x03]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtps2ph_128_m3:
; X64: # BB#0: # %entry
-; X64-NEXT: vcvtps2ph $3, %xmm0, (%rdi)
-; X64-NEXT: retq
+; X64-NEXT: vcvtps2ph $3, %xmm0, (%rdi) # encoding: [0xc4,0xe3,0x79,0x1d,0x07,0x03]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m3:
+; X32-AVX512VL: # BB#0: # %entry
+; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0x00,0x03]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m3:
+; X64-AVX512VL: # BB#0: # %entry
+; X64-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0x07,0x03]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
entry:
%0 = tail call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %f4x32, i32 3)
%1 = bitcast <8 x i16> %0 to <2 x i64>
diff --git a/test/CodeGen/X86/fast-isel-int-float-conversion.ll b/test/CodeGen/X86/fast-isel-int-float-conversion.ll
index 3e69710868b..57b50abab53 100644
--- a/test/CodeGen/X86/fast-isel-int-float-conversion.ll
+++ b/test/CodeGen/X86/fast-isel-int-float-conversion.ll
@@ -31,6 +31,7 @@ define double @int_to_double_rr(i32 %a) {
; SSE2_X86-NEXT: fldl (%esp)
; SSE2_X86-NEXT: movl %ebp, %esp
; SSE2_X86-NEXT: popl %ebp
+; SSE2_X86-NEXT: .cfi_def_cfa %esp, 4
; SSE2_X86-NEXT: retl
;
; AVX_X86-LABEL: int_to_double_rr:
@@ -47,6 +48,7 @@ define double @int_to_double_rr(i32 %a) {
; AVX_X86-NEXT: fldl (%esp)
; AVX_X86-NEXT: movl %ebp, %esp
; AVX_X86-NEXT: popl %ebp
+; AVX_X86-NEXT: .cfi_def_cfa %esp, 4
; AVX_X86-NEXT: retl
entry:
%0 = sitofp i32 %a to double
@@ -80,6 +82,7 @@ define double @int_to_double_rm(i32* %a) {
; SSE2_X86-NEXT: fldl (%esp)
; SSE2_X86-NEXT: movl %ebp, %esp
; SSE2_X86-NEXT: popl %ebp
+; SSE2_X86-NEXT: .cfi_def_cfa %esp, 4
; SSE2_X86-NEXT: retl
;
; AVX_X86-LABEL: int_to_double_rm:
@@ -97,6 +100,7 @@ define double @int_to_double_rm(i32* %a) {
; AVX_X86-NEXT: fldl (%esp)
; AVX_X86-NEXT: movl %ebp, %esp
; AVX_X86-NEXT: popl %ebp
+; AVX_X86-NEXT: .cfi_def_cfa %esp, 4
; AVX_X86-NEXT: retl
entry:
%0 = load i32, i32* %a
@@ -130,6 +134,7 @@ define double @int_to_double_rm_optsize(i32* %a) optsize {
; SSE2_X86-NEXT: fldl (%esp)
; SSE2_X86-NEXT: movl %ebp, %esp
; SSE2_X86-NEXT: popl %ebp
+; SSE2_X86-NEXT: .cfi_def_cfa %esp, 4
; SSE2_X86-NEXT: retl
;
; AVX_X86-LABEL: int_to_double_rm_optsize:
@@ -147,6 +152,7 @@ define double @int_to_double_rm_optsize(i32* %a) optsize {
; AVX_X86-NEXT: fldl (%esp)
; AVX_X86-NEXT: movl %ebp, %esp
; AVX_X86-NEXT: popl %ebp
+; AVX_X86-NEXT: .cfi_def_cfa %esp, 4
; AVX_X86-NEXT: retl
entry:
%0 = load i32, i32* %a
@@ -174,6 +180,7 @@ define float @int_to_float_rr(i32 %a) {
; SSE2_X86-NEXT: movss %xmm0, (%esp)
; SSE2_X86-NEXT: flds (%esp)
; SSE2_X86-NEXT: popl %eax
+; SSE2_X86-NEXT: .cfi_def_cfa_offset 4
; SSE2_X86-NEXT: retl
;
; AVX_X86-LABEL: int_to_float_rr:
@@ -184,6 +191,7 @@ define float @int_to_float_rr(i32 %a) {
; AVX_X86-NEXT: vmovss %xmm0, (%esp)
; AVX_X86-NEXT: flds (%esp)
; AVX_X86-NEXT: popl %eax
+; AVX_X86-NEXT: .cfi_def_cfa_offset 4
; AVX_X86-NEXT: retl
entry:
%0 = sitofp i32 %a to float
@@ -211,6 +219,7 @@ define float @int_to_float_rm(i32* %a) {
; SSE2_X86-NEXT: movss %xmm0, (%esp)
; SSE2_X86-NEXT: flds (%esp)
; SSE2_X86-NEXT: popl %eax
+; SSE2_X86-NEXT: .cfi_def_cfa_offset 4
; SSE2_X86-NEXT: retl
;
; AVX_X86-LABEL: int_to_float_rm:
@@ -222,6 +231,7 @@ define float @int_to_float_rm(i32* %a) {
; AVX_X86-NEXT: vmovss %xmm0, (%esp)
; AVX_X86-NEXT: flds (%esp)
; AVX_X86-NEXT: popl %eax
+; AVX_X86-NEXT: .cfi_def_cfa_offset 4
; AVX_X86-NEXT: retl
entry:
%0 = load i32, i32* %a
@@ -249,6 +259,7 @@ define float @int_to_float_rm_optsize(i32* %a) optsize {
; SSE2_X86-NEXT: movss %xmm0, (%esp)
; SSE2_X86-NEXT: flds (%esp)
; SSE2_X86-NEXT: popl %eax
+; SSE2_X86-NEXT: .cfi_def_cfa_offset 4
; SSE2_X86-NEXT: retl
;
; AVX_X86-LABEL: int_to_float_rm_optsize:
@@ -260,6 +271,7 @@ define float @int_to_float_rm_optsize(i32* %a) optsize {
; AVX_X86-NEXT: vmovss %xmm0, (%esp)
; AVX_X86-NEXT: flds (%esp)
; AVX_X86-NEXT: popl %eax
+; AVX_X86-NEXT: .cfi_def_cfa_offset 4
; AVX_X86-NEXT: retl
entry:
%0 = load i32, i32* %a
diff --git a/test/CodeGen/X86/fast-isel-store.ll b/test/CodeGen/X86/fast-isel-store.ll
index e359e620563..e2412e9c5c0 100644
--- a/test/CodeGen/X86/fast-isel-store.ll
+++ b/test/CodeGen/X86/fast-isel-store.ll
@@ -375,6 +375,7 @@ define <4 x double> @test_store_4xf64(<4 x double>* nocapture %addr, <4 x double
; SSE64-NEXT: movupd %xmm0, (%eax)
; SSE64-NEXT: movupd %xmm1, 16(%eax)
; SSE64-NEXT: addl $12, %esp
+; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_4xf64:
@@ -413,6 +414,7 @@ define <4 x double> @test_store_4xf64_aligned(<4 x double>* nocapture %addr, <4
; SSE64-NEXT: movapd %xmm0, (%eax)
; SSE64-NEXT: movapd %xmm1, 16(%eax)
; SSE64-NEXT: addl $12, %esp
+; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_4xf64_aligned:
@@ -452,6 +454,7 @@ define <16 x i32> @test_store_16xi32(<16 x i32>* nocapture %addr, <16 x i32> %va
; SSE64-NEXT: movups %xmm2, 32(%eax)
; SSE64-NEXT: movups %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
+; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_16xi32:
@@ -501,6 +504,7 @@ define <16 x i32> @test_store_16xi32_aligned(<16 x i32>* nocapture %addr, <16 x
; SSE64-NEXT: movaps %xmm2, 32(%eax)
; SSE64-NEXT: movaps %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
+; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_16xi32_aligned:
@@ -550,6 +554,7 @@ define <16 x float> @test_store_16xf32(<16 x float>* nocapture %addr, <16 x floa
; SSE64-NEXT: movups %xmm2, 32(%eax)
; SSE64-NEXT: movups %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
+; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_16xf32:
@@ -599,6 +604,7 @@ define <16 x float> @test_store_16xf32_aligned(<16 x float>* nocapture %addr, <1
; SSE64-NEXT: movaps %xmm2, 32(%eax)
; SSE64-NEXT: movaps %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
+; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_16xf32_aligned:
@@ -656,6 +662,7 @@ define <8 x double> @test_store_8xf64(<8 x double>* nocapture %addr, <8 x double
; SSE64-NEXT: movupd %xmm2, 32(%eax)
; SSE64-NEXT: movupd %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
+; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_8xf64:
@@ -682,6 +689,7 @@ define <8 x double> @test_store_8xf64(<8 x double>* nocapture %addr, <8 x double
; AVXONLY64-NEXT: vmovupd %ymm1, 32(%eax)
; AVXONLY64-NEXT: movl %ebp, %esp
; AVXONLY64-NEXT: popl %ebp
+; AVXONLY64-NEXT: .cfi_def_cfa %esp, 4
; AVXONLY64-NEXT: retl
;
; AVX51232-LABEL: test_store_8xf64:
@@ -729,6 +737,7 @@ define <8 x double> @test_store_8xf64_aligned(<8 x double>* nocapture %addr, <8
; SSE64-NEXT: movapd %xmm2, 32(%eax)
; SSE64-NEXT: movapd %xmm3, 48(%eax)
; SSE64-NEXT: addl $12, %esp
+; SSE64-NEXT: .cfi_def_cfa_offset 4
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_8xf64_aligned:
@@ -755,6 +764,7 @@ define <8 x double> @test_store_8xf64_aligned(<8 x double>* nocapture %addr, <8
; AVXONLY64-NEXT: vmovapd %ymm1, 32(%eax)
; AVXONLY64-NEXT: movl %ebp, %esp
; AVXONLY64-NEXT: popl %ebp
+; AVXONLY64-NEXT: .cfi_def_cfa %esp, 4
; AVXONLY64-NEXT: retl
;
; AVX51232-LABEL: test_store_8xf64_aligned:
diff --git a/test/CodeGen/X86/fma-intrinsics-x86.ll b/test/CodeGen/X86/fma-intrinsics-x86.ll
index 68f39469a82..362864f72a9 100644
--- a/test/CodeGen/X86/fma-intrinsics-x86.ll
+++ b/test/CodeGen/X86/fma-intrinsics-x86.ll
@@ -1,29 +1,32 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx -mattr=+fma | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core-avx2 -mattr=+fma,+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA
-; RUN: llc < %s -mtriple=x86_64-pc-windows -mcpu=core-avx2 -mattr=+fma,+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA-WIN
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx -mattr=+fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA4
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=+avx,-fma | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA4
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=-fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+fma,-fma4 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,-fma4 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AVX512VL
+; RUN: llc < %s -mtriple=x86_64-pc-windows -mattr=+fma,-fma4 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA-WIN
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+fma4,-fma -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA4
; VFMADD
define <4 x float> @test_x86_fma_vfmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ss:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa9,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_ss:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa9,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ss:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmadd213ss (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x0a]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
+; CHECK-FMA-WIN-NEXT: vfmadd213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa9,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_ss:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddss %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddss %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x6a,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
@@ -31,21 +34,27 @@ define <4 x float> @test_x86_fma_vfmadd_ss(<4 x float> %a0, <4 x float> %a1, <4
define <4 x float> @test_x86_fma_vfmadd_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_bac_ss:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1
-; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0x79,0xa9,0xca]
+; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_bac_ss:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa9,0xca]
+; CHECK-AVX512VL-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_bac_ss:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmadd213ss (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmadd213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa9,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_bac_ss:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddss %xmm2, %xmm0, %xmm1, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddss %xmm2, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0xf1,0x6a,0xc2,0x00]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2)
ret <4 x float> %res
}
@@ -54,20 +63,25 @@ declare <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float>, <4 x float>, <4 x float
define <2 x double> @test_x86_fma_vfmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_sd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa9,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_sd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa9,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_sd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmadd213sd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x0a]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
+; CHECK-FMA-WIN-NEXT: vfmadd213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa9,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_sd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x6b,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
@@ -75,21 +89,27 @@ define <2 x double> @test_x86_fma_vfmadd_sd(<2 x double> %a0, <2 x double> %a1,
define <2 x double> @test_x86_fma_vfmadd_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_bac_sd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm1
-; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0xf9,0xa9,0xca]
+; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_bac_sd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa9,0xca]
+; CHECK-AVX512VL-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_bac_sd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmadd213sd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmadd213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa9,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_bac_sd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddsd %xmm2, %xmm0, %xmm1, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddsd %xmm2, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0xf1,0x6b,0xc2,0x00]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2)
ret <2 x double> %res
}
@@ -98,20 +118,25 @@ declare <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double>, <2 x double>, <2 x do
define <4 x float> @test_x86_fma_vfmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ps:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa8,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_ps:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa8,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ps:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmadd213ps (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa8,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_ps:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddps %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x68,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
@@ -120,20 +145,25 @@ declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float
define <2 x double> @test_x86_fma_vfmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_pd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa8,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_pd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa8,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_pd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmadd213pd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmadd213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa8,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_pd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddpd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x69,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
@@ -142,20 +172,25 @@ declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x do
define <8 x float> @test_x86_fma_vfmadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ps_256:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xa8,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_ps_256:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xa8,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ps_256:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0
-; CHECK-FMA-WIN-NEXT: vfmadd213ps (%r8), %ymm1, %ymm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa8,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_ps_256:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddps %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x68,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
ret <8 x float> %res
}
@@ -164,20 +199,25 @@ declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x f
define <4 x double> @test_x86_fma_vfmadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmadd_pd_256:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xa8,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_pd_256:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa8,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_pd_256:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0
-; CHECK-FMA-WIN-NEXT: vfmadd213pd (%r8), %ymm1, %ymm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmadd213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xa8,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_pd_256:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x69,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
ret <4 x double> %res
}
@@ -187,20 +227,25 @@ declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4
define <4 x float> @test_x86_fma_vfmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ss:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xab,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_ss:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xab,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_ss:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmsub213ss (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x0a]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
+; CHECK-FMA-WIN-NEXT: vfmsub213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xab,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_ss:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubss %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubss %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x6e,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
@@ -208,21 +253,27 @@ define <4 x float> @test_x86_fma_vfmsub_ss(<4 x float> %a0, <4 x float> %a1, <4
define <4 x float> @test_x86_fma_vfmsub_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_bac_ss:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmsub213ss %xmm2, %xmm0, %xmm1
-; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmsub213ss %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0x79,0xab,0xca]
+; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_bac_ss:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmsub213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xab,0xca]
+; CHECK-AVX512VL-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_bac_ss:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmsub213ss (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsub213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xab,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_bac_ss:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubss %xmm2, %xmm0, %xmm1, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubss %xmm2, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0xf1,0x6e,0xc2,0x00]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2)
ret <4 x float> %res
}
@@ -231,20 +282,25 @@ declare <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float>, <4 x float>, <4 x float
define <2 x double> @test_x86_fma_vfmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_sd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xab,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_sd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xab,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_sd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmsub213sd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x0a]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
+; CHECK-FMA-WIN-NEXT: vfmsub213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xab,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_sd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x6f,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
@@ -252,21 +308,27 @@ define <2 x double> @test_x86_fma_vfmsub_sd(<2 x double> %a0, <2 x double> %a1,
define <2 x double> @test_x86_fma_vfmsub_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_bac_sd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1
-; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0xf9,0xab,0xca]
+; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_bac_sd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xab,0xca]
+; CHECK-AVX512VL-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_bac_sd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmsub213sd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsub213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xab,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_bac_sd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubsd %xmm2, %xmm0, %xmm1, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubsd %xmm2, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0xf1,0x6f,0xc2,0x00]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2)
ret <2 x double> %res
}
@@ -275,20 +337,25 @@ declare <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double>, <2 x double>, <2 x do
define <4 x float> @test_x86_fma_vfmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ps:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xaa,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_ps:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xaa,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_ps:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmsub213ps (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xaa,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_ps:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubps %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x6c,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
@@ -297,20 +364,25 @@ declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float
define <2 x double> @test_x86_fma_vfmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_pd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xaa,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_pd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xaa,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_pd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmsub213pd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsub213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xaa,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_pd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x6d,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
@@ -319,20 +391,25 @@ declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x do
define <8 x float> @test_x86_fma_vfmsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ps_256:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xaa,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_ps_256:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xaa,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_ps_256:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0
-; CHECK-FMA-WIN-NEXT: vfmsub213ps (%r8), %ymm1, %ymm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xaa,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_ps_256:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubps %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x6c,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
ret <8 x float> %res
}
@@ -341,20 +418,25 @@ declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x f
define <4 x double> @test_x86_fma_vfmsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsub_pd_256:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xaa,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_pd_256:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xaa,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_pd_256:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0
-; CHECK-FMA-WIN-NEXT: vfmsub213pd (%r8), %ymm1, %ymm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsub213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xaa,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_pd_256:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x6d,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
ret <4 x double> %res
}
@@ -364,20 +446,25 @@ declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4
define <4 x float> @test_x86_fma_vfnmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ss:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xad,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_ss:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xad,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_ss:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfnmadd213ss (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x0a]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
+; CHECK-FMA-WIN-NEXT: vfnmadd213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xad,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_ss:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x7a,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
@@ -385,21 +472,27 @@ define <4 x float> @test_x86_fma_vfnmadd_ss(<4 x float> %a0, <4 x float> %a1, <4
define <4 x float> @test_x86_fma_vfnmadd_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_bac_ss:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm1
-; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0x79,0xad,0xca]
+; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_bac_ss:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xad,0xca]
+; CHECK-AVX512VL-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_bac_ss:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfnmadd213ss (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmadd213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xad,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_bac_ss:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmaddss %xmm2, %xmm0, %xmm1, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmaddss %xmm2, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0xf1,0x7a,0xc2,0x00]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2)
ret <4 x float> %res
}
@@ -408,20 +501,25 @@ declare <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float>, <4 x float>, <4 x floa
define <2 x double> @test_x86_fma_vfnmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_sd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xad,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_sd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xad,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_sd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfnmadd213sd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x0a]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
+; CHECK-FMA-WIN-NEXT: vfnmadd213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xad,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_sd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x7b,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
@@ -429,21 +527,27 @@ define <2 x double> @test_x86_fma_vfnmadd_sd(<2 x double> %a0, <2 x double> %a1,
define <2 x double> @test_x86_fma_vfnmadd_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_bac_sd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmadd213sd %xmm2, %xmm0, %xmm1
-; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmadd213sd %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0xf9,0xad,0xca]
+; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_bac_sd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmadd213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xad,0xca]
+; CHECK-AVX512VL-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_bac_sd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfnmadd213sd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmadd213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xad,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_bac_sd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmaddsd %xmm2, %xmm0, %xmm1, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmaddsd %xmm2, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0xf1,0x7b,0xc2,0x00]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2)
ret <2 x double> %res
}
@@ -452,20 +556,25 @@ declare <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double>, <2 x double>, <2 x d
define <4 x float> @test_x86_fma_vfnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ps:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xac,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_ps:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xac,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_ps:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfnmadd213ps (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xac,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_ps:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x78,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
@@ -474,20 +583,25 @@ declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x floa
define <2 x double> @test_x86_fma_vfnmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_pd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xac,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_pd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xac,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_pd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfnmadd213pd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmadd213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xac,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_pd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmaddpd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmaddpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x79,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
@@ -496,20 +610,25 @@ declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x d
define <8 x float> @test_x86_fma_vfnmadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ps_256:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xac,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_ps_256:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xac,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_ps_256:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0
-; CHECK-FMA-WIN-NEXT: vfnmadd213ps (%r8), %ymm1, %ymm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xac,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_ps_256:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x78,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
ret <8 x float> %res
}
@@ -518,20 +637,25 @@ declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x
define <4 x double> @test_x86_fma_vfnmadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_pd_256:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xac,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_pd_256:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xac,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_pd_256:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0
-; CHECK-FMA-WIN-NEXT: vfnmadd213pd (%r8), %ymm1, %ymm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmadd213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xac,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_pd_256:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x79,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
ret <4 x double> %res
}
@@ -541,20 +665,25 @@ declare <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double>, <4 x double>, <4
define <4 x float> @test_x86_fma_vfnmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ss:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xaf,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_ss:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xaf,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_ss:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfnmsub213ss (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x0a]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
+; CHECK-FMA-WIN-NEXT: vfnmsub213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xaf,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_ss:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x7e,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
@@ -562,21 +691,27 @@ define <4 x float> @test_x86_fma_vfnmsub_ss(<4 x float> %a0, <4 x float> %a1, <4
define <4 x float> @test_x86_fma_vfnmsub_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_bac_ss:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmsub213ss %xmm2, %xmm0, %xmm1
-; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmsub213ss %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0x79,0xaf,0xca]
+; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_bac_ss:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmsub213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xaf,0xca]
+; CHECK-AVX512VL-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_bac_ss:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfnmsub213ss (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmsub213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xaf,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_bac_ss:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmsubss %xmm2, %xmm0, %xmm1, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmsubss %xmm2, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0xf1,0x7e,0xc2,0x00]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2)
ret <4 x float> %res
}
@@ -585,20 +720,25 @@ declare <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float>, <4 x float>, <4 x floa
define <2 x double> @test_x86_fma_vfnmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_sd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xaf,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_sd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xaf,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_sd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfnmsub213sd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x0a]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
+; CHECK-FMA-WIN-NEXT: vfnmsub213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xaf,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_sd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x7f,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
@@ -606,21 +746,27 @@ define <2 x double> @test_x86_fma_vfnmsub_sd(<2 x double> %a0, <2 x double> %a1,
define <2 x double> @test_x86_fma_vfnmsub_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_bac_sd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1
-; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0xf9,0xaf,0xca]
+; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_bac_sd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xaf,0xca]
+; CHECK-AVX512VL-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_bac_sd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfnmsub213sd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmsub213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xaf,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_bac_sd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmsubsd %xmm2, %xmm0, %xmm1, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmsubsd %xmm2, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0xf1,0x7f,0xc2,0x00]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2)
ret <2 x double> %res
}
@@ -629,20 +775,25 @@ declare <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double>, <2 x double>, <2 x d
define <4 x float> @test_x86_fma_vfnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ps:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xae,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_ps:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xae,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_ps:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfnmsub213ps (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xae,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_ps:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x7c,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
@@ -651,20 +802,25 @@ declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x floa
define <2 x double> @test_x86_fma_vfnmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_pd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xae,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_pd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xae,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_pd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfnmsub213pd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmsub213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xae,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_pd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmsubpd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmsubpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x7d,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
@@ -673,20 +829,25 @@ declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x d
define <8 x float> @test_x86_fma_vfnmsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ps_256:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xae,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_ps_256:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xae,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_ps_256:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0
-; CHECK-FMA-WIN-NEXT: vfnmsub213ps (%r8), %ymm1, %ymm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xae,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_ps_256:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmsubps %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmsubps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x7c,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
ret <8 x float> %res
}
@@ -695,20 +856,25 @@ declare <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x
define <4 x double> @test_x86_fma_vfnmsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_pd_256:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xae,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_pd_256:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xae,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_pd_256:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0
-; CHECK-FMA-WIN-NEXT: vfnmsub213pd (%r8), %ymm1, %ymm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmsub213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xae,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_pd_256:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfnmsubpd %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfnmsubpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x7d,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
ret <4 x double> %res
}
@@ -718,20 +884,25 @@ declare <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double>, <4 x double>, <4
define <4 x float> @test_x86_fma_vfmaddsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_ps:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa6,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmaddsub_ps:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa6,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_ps:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmaddsub213ps (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmaddsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa6,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmaddsub_ps:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x5c,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
@@ -740,20 +911,25 @@ declare <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float>, <4 x float>, <4 x fl
define <2 x double> @test_x86_fma_vfmaddsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_pd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa6,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmaddsub_pd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa6,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_pd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmaddsub213pd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmaddsub213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa6,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmaddsub_pd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x5d,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
@@ -762,20 +938,25 @@ declare <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double>, <2 x double>, <2 x
define <8 x float> @test_x86_fma_vfmaddsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_ps_256:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xa6,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmaddsub_ps_256:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xa6,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_ps_256:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0
-; CHECK-FMA-WIN-NEXT: vfmaddsub213ps (%r8), %ymm1, %ymm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmaddsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa6,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmaddsub_ps_256:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x5c,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
ret <8 x float> %res
}
@@ -784,20 +965,25 @@ declare <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8
define <4 x double> @test_x86_fma_vfmaddsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_pd_256:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xa6,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmaddsub_pd_256:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa6,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_pd_256:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0
-; CHECK-FMA-WIN-NEXT: vfmaddsub213pd (%r8), %ymm1, %ymm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmaddsub213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xa6,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmaddsub_pd_256:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x5d,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
ret <4 x double> %res
}
@@ -807,20 +993,25 @@ declare <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double>, <4 x double>,
define <4 x float> @test_x86_fma_vfmsubadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_ps:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa7,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsubadd_ps:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa7,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_ps:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmsubadd213ps (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsubadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa7,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmsubadd_ps:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubaddps %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubaddps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x5e,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
@@ -829,20 +1020,25 @@ declare <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float>, <4 x float>, <4 x fl
define <2 x double> @test_x86_fma_vfmsubadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_pd:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa7,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsubadd_pd:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa7,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_pd:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0
-; CHECK-FMA-WIN-NEXT: vfmsubadd213pd (%r8), %xmm1, %xmm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsubadd213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa7,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmsubadd_pd:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubaddpd %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubaddpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x5f,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
@@ -851,20 +1047,25 @@ declare <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double>, <2 x double>, <2 x
define <8 x float> @test_x86_fma_vfmsubadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_ps_256:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xa7,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsubadd_ps_256:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xa7,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_ps_256:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1
-; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0
-; CHECK-FMA-WIN-NEXT: vfmsubadd213ps (%r8), %ymm1, %ymm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsubadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa7,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmsubadd_ps_256:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x5e,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
ret <8 x float> %res
}
@@ -873,20 +1074,25 @@ declare <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8
define <4 x double> @test_x86_fma_vfmsubadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_pd_256:
; CHECK-FMA: # BB#0:
-; CHECK-FMA-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0
-; CHECK-FMA-NEXT: retq
+; CHECK-FMA-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xa7,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsubadd_pd_256:
+; CHECK-AVX512VL: # BB#0:
+; CHECK-AVX512VL-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa7,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_pd_256:
; CHECK-FMA-WIN: # BB#0:
-; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1
-; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0
-; CHECK-FMA-WIN-NEXT: vfmsubadd213pd (%r8), %ymm1, %ymm0
-; CHECK-FMA-WIN-NEXT: retq
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsubadd213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xa7,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
;
; CHECK-FMA4-LABEL: test_x86_fma_vfmsubadd_pd_256:
; CHECK-FMA4: # BB#0:
-; CHECK-FMA4-NEXT: vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-FMA4-NEXT: retq
+; CHECK-FMA4-NEXT: vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x5f,0xc2,0x10]
+; CHECK-FMA4-NEXT: retq # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
ret <4 x double> %res
}
diff --git a/test/CodeGen/X86/frame-lowering-debug-intrinsic-2.ll b/test/CodeGen/X86/frame-lowering-debug-intrinsic-2.ll
index ba80c839fdd..ee64790d1d9 100644
--- a/test/CodeGen/X86/frame-lowering-debug-intrinsic-2.ll
+++ b/test/CodeGen/X86/frame-lowering-debug-intrinsic-2.ll
@@ -18,11 +18,15 @@ entry:
}
; CHECK-LABEL: noDebug
-; CHECK: addq $24, %rsp
-; CHECK: popq %rbx
-; CHECK-NEXT: popq %r14
-; CHECK-NEXT: retq
-
+; CHECK: addq $16, %rsp
+; CHECK-NEXT: .cfi_adjust_cfa_offset -16
+; CHECK-NEXT: addq $8, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
define void @withDebug() !dbg !18 {
entry:
@@ -42,9 +46,11 @@ entry:
; CHECK-LABEL: withDebug
; CHECK: callq printf
; CHECK: callq printf
-; CHECK-NEXT: addq $24, %rsp
+; CHECK-NEXT: addq $16, %rsp
; CHECK: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64)
diff --git a/test/CodeGen/X86/frame-lowering-debug-intrinsic.ll b/test/CodeGen/X86/frame-lowering-debug-intrinsic.ll
index f9ecf707810..de9d6bf93d6 100644
--- a/test/CodeGen/X86/frame-lowering-debug-intrinsic.ll
+++ b/test/CodeGen/X86/frame-lowering-debug-intrinsic.ll
@@ -9,6 +9,7 @@ define i64 @fn1NoDebug(i64 %a) {
; CHECK-LABEL: fn1NoDebug
; CHECK: popq %rcx
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: ret
define i64 @fn1WithDebug(i64 %a) !dbg !4 {
@@ -19,6 +20,7 @@ define i64 @fn1WithDebug(i64 %a) !dbg !4 {
; CHECK-LABEL: fn1WithDebug
; CHECK: popq %rcx
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: ret
%struct.Buffer = type { i8, [63 x i8] }
@@ -33,6 +35,7 @@ define void @fn2NoDebug(%struct.Buffer* byval align 64 %p1) {
; CHECK-NOT: sub
; CHECK: mov
; CHECK-NEXT: pop
+; CHECK-NEXT: .cfi_def_cfa %rsp, 8
; CHECK-NEXT: ret
define void @fn2WithDebug(%struct.Buffer* byval align 64 %p1) !dbg !8 {
@@ -46,6 +49,7 @@ define void @fn2WithDebug(%struct.Buffer* byval align 64 %p1) !dbg !8 {
; CHECK-NOT: sub
; CHECK: mov
; CHECK-NEXT: pop
+; CHECK-NEXT: .cfi_def_cfa %rsp, 8
; CHECK-NEXT: ret
declare i64 @fn(i64, i64)
diff --git a/test/CodeGen/X86/haddsub-2.ll b/test/CodeGen/X86/haddsub-2.ll
index e32c7452b0c..7126fb233e6 100644
--- a/test/CodeGen/X86/haddsub-2.ll
+++ b/test/CodeGen/X86/haddsub-2.ll
@@ -724,11 +724,17 @@ define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) {
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; SSE3-NEXT: popq %rbx
+; SSE3-NEXT: .cfi_def_cfa_offset 48
; SSE3-NEXT: popq %r12
+; SSE3-NEXT: .cfi_def_cfa_offset 40
; SSE3-NEXT: popq %r13
+; SSE3-NEXT: .cfi_def_cfa_offset 32
; SSE3-NEXT: popq %r14
+; SSE3-NEXT: .cfi_def_cfa_offset 24
; SSE3-NEXT: popq %r15
+; SSE3-NEXT: .cfi_def_cfa_offset 16
; SSE3-NEXT: popq %rbp
+; SSE3-NEXT: .cfi_def_cfa_offset 8
; SSE3-NEXT: retq
;
; SSSE3-LABEL: avx2_vphadd_w_test:
@@ -1351,11 +1357,17 @@ define <16 x i16> @avx2_hadd_w(<16 x i16> %a, <16 x i16> %b) {
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; SSE3-NEXT: popq %rbx
+; SSE3-NEXT: .cfi_def_cfa_offset 48
; SSE3-NEXT: popq %r12
+; SSE3-NEXT: .cfi_def_cfa_offset 40
; SSE3-NEXT: popq %r13
+; SSE3-NEXT: .cfi_def_cfa_offset 32
; SSE3-NEXT: popq %r14
+; SSE3-NEXT: .cfi_def_cfa_offset 24
; SSE3-NEXT: popq %r15
+; SSE3-NEXT: .cfi_def_cfa_offset 16
; SSE3-NEXT: popq %rbp
+; SSE3-NEXT: .cfi_def_cfa_offset 8
; SSE3-NEXT: retq
;
; SSSE3-LABEL: avx2_hadd_w:
diff --git a/test/CodeGen/X86/hipe-cc64.ll b/test/CodeGen/X86/hipe-cc64.ll
index efe07cf6301..ce2d0e9c671 100644
--- a/test/CodeGen/X86/hipe-cc64.ll
+++ b/test/CodeGen/X86/hipe-cc64.ll
@@ -87,6 +87,7 @@ define cc 11 { i64, i64, i64 } @tailcaller(i64 %hp, i64 %p) #0 {
; CHECK-NEXT: movl $47, %ecx
; CHECK-NEXT: movl $63, %r8d
; CHECK-NEXT: popq %rax
+ ; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: jmp tailcallee
%ret = tail call cc11 { i64, i64, i64 } @tailcallee(i64 %hp, i64 %p, i64 15,
i64 31, i64 47, i64 63, i64 79) #1
diff --git a/test/CodeGen/X86/horizontal-reduce-smax.ll b/test/CodeGen/X86/horizontal-reduce-smax.ll
new file mode 100644
index 00000000000..8f5aac493b5
--- /dev/null
+++ b/test/CodeGen/X86/horizontal-reduce-smax.ll
@@ -0,0 +1,1896 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE2
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE42
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE42
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX512
+
+;
+; 128-bit Vectors
+;
+
+define i64 @test_reduce_v2i64(<2 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v2i64:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: pxor %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm2, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm3
+; X86-SSE2-NEXT: por %xmm0, %xmm3
+; X86-SSE2-NEXT: movd %xmm3, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v2i64:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT: movd %xmm2, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v2i64:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v2i64:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: pxor %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm3
+; X64-SSE2-NEXT: por %xmm0, %xmm3
+; X64-SSE2-NEXT: movq %xmm3, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v2i64:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT: movq %xmm2, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v2i64:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v2i64:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v2i64:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+ %2 = icmp sgt <2 x i64> %a0, %1
+ %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %1
+ %4 = extractelement <2 x i64> %3, i32 0
+ ret i64 %4
+}
+
+define i32 @test_reduce_v4i32(<4 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i32:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i32:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v4i32:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i32:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i32:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v4i32:
+; X64-AVX: ## BB#0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovd %xmm0, %eax
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ %2 = icmp sgt <4 x i32> %a0, %1
+ %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %1
+ %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <4 x i32> %3, %4
+ %6 = select <4 x i1> %5, <4 x i32> %3, <4 x i32> %4
+ %7 = extractelement <4 x i32> %6, i32 0
+ ret i32 %7
+}
+
+define i16 @test_reduce_v8i16(<8 x i16> %a0) {
+; X86-SSE-LABEL: test_reduce_v8i16:
+; X86-SSE: ## BB#0:
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0
+; X86-SSE-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE-NEXT: psrld $16, %xmm1
+; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1
+; X86-SSE-NEXT: movd %xmm1, %eax
+; X86-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v8i16:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: test_reduce_v8i16:
+; X64-SSE: ## BB#0:
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0
+; X64-SSE-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE-NEXT: psrld $16, %xmm1
+; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1
+; X64-SSE-NEXT: movd %xmm1, %eax
+; X64-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v8i16:
+; X64-AVX: ## BB#0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovd %xmm0, %eax
+; X64-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp sgt <8 x i16> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %1
+ %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <8 x i16> %3, %4
+ %6 = select <8 x i1> %5, <8 x i16> %3, <8 x i16> %4
+ %7 = shufflevector <8 x i16> %6, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp sgt <8 x i16> %6, %7
+ %9 = select <8 x i1> %8, <8 x i16> %6, <8 x i16> %7
+ %10 = extractelement <8 x i16> %9, i32 0
+ ret i16 %10
+}
+
+define i8 @test_reduce_v16i8(<16 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i8:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i8:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v16i8:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i8:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i8:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v16i8:
+; X64-AVX: ## BB#0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp sgt <16 x i8> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %1
+ %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <16 x i8> %3, %4
+ %6 = select <16 x i1> %5, <16 x i8> %3, <16 x i8> %4
+ %7 = shufflevector <16 x i8> %6, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp sgt <16 x i8> %6, %7
+ %9 = select <16 x i1> %8, <16 x i8> %6, <16 x i8> %7
+ %10 = shufflevector <16 x i8> %9, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp sgt <16 x i8> %9, %10
+ %12 = select <16 x i1> %11, <16 x i8> %9, <16 x i8> %10
+ %13 = extractelement <16 x i8> %12, i32 0
+ ret i8 %13
+}
+
+;
+; 256-bit Vectors
+;
+
+define i64 @test_reduce_v4i64(<4 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i64:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm6, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i64:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT: movd %xmm2, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v4i64:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v4i64:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i64:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm6, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm4, %xmm2
+; X64-SSE2-NEXT: movq %xmm2, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i64:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT: movq %xmm2, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v4i64:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v4i64:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v4i64:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ %2 = icmp sgt <4 x i64> %a0, %1
+ %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %1
+ %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <4 x i64> %3, %4
+ %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> %4
+ %7 = extractelement <4 x i64> %6, i32 0
+ ret i64 %7
+}
+
+define i32 @test_reduce_v8i32(<8 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i32:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i32:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i32:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i32:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i32:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movd %xmm2, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i32:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i32:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i32:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i32:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp sgt <8 x i32> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %1
+ %4 = shufflevector <8 x i32> %3, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <8 x i32> %3, %4
+ %6 = select <8 x i1> %5, <8 x i32> %3, <8 x i32> %4
+ %7 = shufflevector <8 x i32> %6, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp sgt <8 x i32> %6, %7
+ %9 = select <8 x i1> %8, <8 x i32> %6, <8 x i32> %7
+ %10 = extractelement <8 x i32> %9, i32 0
+ ret i32 %10
+}
+
+define i16 @test_reduce_v16i16(<16 x i16> %a0) {
+; X86-SSE-LABEL: test_reduce_v16i16:
+; X86-SSE: ## BB#0:
+; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0
+; X86-SSE-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE-NEXT: psrld $16, %xmm1
+; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1
+; X86-SSE-NEXT: movd %xmm1, %eax
+; X86-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i16:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i16:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE-LABEL: test_reduce_v16i16:
+; X64-SSE: ## BB#0:
+; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0
+; X64-SSE-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE-NEXT: psrld $16, %xmm1
+; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1
+; X64-SSE-NEXT: movd %xmm1, %eax
+; X64-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i16:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i16:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i16:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp sgt <16 x i16> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1
+ %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <16 x i16> %3, %4
+ %6 = select <16 x i1> %5, <16 x i16> %3, <16 x i16> %4
+ %7 = shufflevector <16 x i16> %6, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp sgt <16 x i16> %6, %7
+ %9 = select <16 x i1> %8, <16 x i16> %6, <16 x i16> %7
+ %10 = shufflevector <16 x i16> %9, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp sgt <16 x i16> %9, %10
+ %12 = select <16 x i1> %11, <16 x i16> %9, <16 x i16> %10
+ %13 = extractelement <16 x i16> %12, i32 0
+ ret i16 %13
+}
+
+define i8 @test_reduce_v32i8(<32 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v32i8:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v32i8:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i8:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i8:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v32i8:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movd %xmm2, %eax
+; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v32i8:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i8:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i8:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i8:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp sgt <32 x i8> %a0, %1
+ %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1
+ %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <32 x i8> %3, %4
+ %6 = select <32 x i1> %5, <32 x i8> %3, <32 x i8> %4
+ %7 = shufflevector <32 x i8> %6, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp sgt <32 x i8> %6, %7
+ %9 = select <32 x i1> %8, <32 x i8> %6, <32 x i8> %7
+ %10 = shufflevector <32 x i8> %9, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp sgt <32 x i8> %9, %10
+ %12 = select <32 x i1> %11, <32 x i8> %9, <32 x i8> %10
+ %13 = shufflevector <32 x i8> %12, <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp sgt <32 x i8> %12, %13
+ %15 = select <32 x i1> %14, <32 x i8> %12, <32 x i8> %13
+ %16 = extractelement <32 x i8> %15, i32 0
+ ret i8 %16
+}
+
+;
+; 512-bit Vectors
+;
+
+define i64 @test_reduce_v8i64(<8 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i64:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: subl $28, %esp
+; X86-SSE2-NEXT: .cfi_def_cfa_offset 32
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm5, (%esp) ## 16-byte Spill
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm7
+; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm6
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm6, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm7
+; X86-SSE2-NEXT: pxor %xmm4, %xmm7
+; X86-SSE2-NEXT: movdqa %xmm7, %xmm0
+; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm0
+; X86-SSE2-NEXT: pcmpeqd %xmm6, %xmm7
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; X86-SSE2-NEXT: pand %xmm6, %xmm7
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm7, %xmm6
+; X86-SSE2-NEXT: pand %xmm6, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm6
+; X86-SSE2-NEXT: por %xmm1, %xmm6
+; X86-SSE2-NEXT: pand %xmm5, %xmm2
+; X86-SSE2-NEXT: pandn (%esp), %xmm5 ## 16-byte Folded Reload
+; X86-SSE2-NEXT: por %xmm2, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X86-SSE2-NEXT: pxor %xmm4, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm1, %xmm0
+; X86-SSE2-NEXT: pand %xmm0, %xmm6
+; X86-SSE2-NEXT: pandn %xmm5, %xmm0
+; X86-SSE2-NEXT: por %xmm6, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: pxor %xmm1, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: addl $28, %esp
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i64:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE42-NEXT: movdqa %xmm4, %xmm5
+; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm5
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; X86-SSE42-NEXT: movdqa %xmm5, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2
+; X86-SSE42-NEXT: movapd %xmm2, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X86-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm1, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i64:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i64:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i64:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X64-SSE2-NEXT: pxor %xmm4, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm7
+; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm8, %xmm6
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm6, %xmm8
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm7
+; X64-SSE2-NEXT: pxor %xmm4, %xmm7
+; X64-SSE2-NEXT: movdqa %xmm7, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm5
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm6, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm9, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm7, %xmm6
+; X64-SSE2-NEXT: pand %xmm6, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm6
+; X64-SSE2-NEXT: por %xmm0, %xmm6
+; X64-SSE2-NEXT: pand %xmm8, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm8
+; X64-SSE2-NEXT: por %xmm1, %xmm8
+; X64-SSE2-NEXT: movdqa %xmm8, %xmm0
+; X64-SSE2-NEXT: pxor %xmm4, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm6
+; X64-SSE2-NEXT: pandn %xmm8, %xmm1
+; X64-SSE2-NEXT: por %xmm6, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm4, %xmm2
+; X64-SSE2-NEXT: pxor %xmm0, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm2, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm1, %xmm3
+; X64-SSE2-NEXT: movq %xmm3, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i64:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE42-NEXT: movdqa %xmm4, %xmm5
+; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm5
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; X64-SSE42-NEXT: movdqa %xmm5, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2
+; X64-SSE42-NEXT: movapd %xmm2, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X64-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; X64-SSE42-NEXT: movq %xmm1, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i64:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i64:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i64:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp sgt <8 x i64> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %1
+ %4 = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <8 x i64> %3, %4
+ %6 = select <8 x i1> %5, <8 x i64> %3, <8 x i64> %4
+ %7 = shufflevector <8 x i64> %6, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp sgt <8 x i64> %6, %7
+ %9 = select <8 x i1> %8, <8 x i64> %6, <8 x i64> %7
+ %10 = extractelement <8 x i64> %9, i32 0
+ ret i64 %10
+}
+
+define i32 @test_reduce_v16i32(<16 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i32:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm5
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X86-SSE2-NEXT: pand %xmm5, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm5
+; X86-SSE2-NEXT: por %xmm1, %xmm5
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm0
+; X86-SSE2-NEXT: pand %xmm0, %xmm4
+; X86-SSE2-NEXT: pandn %xmm5, %xmm0
+; X86-SSE2-NEXT: por %xmm4, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i32:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pmaxsd %xmm3, %xmm1
+; X86-SSE42-NEXT: pmaxsd %xmm2, %xmm0
+; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i32:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpmaxsd %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i32:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i32:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X64-SSE2-NEXT: pand %xmm5, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm5
+; X64-SSE2-NEXT: por %xmm1, %xmm5
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm0
+; X64-SSE2-NEXT: pand %xmm0, %xmm4
+; X64-SSE2-NEXT: pandn %xmm5, %xmm0
+; X64-SSE2-NEXT: por %xmm4, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i32:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pmaxsd %xmm3, %xmm1
+; X64-SSE42-NEXT: pmaxsd %xmm2, %xmm0
+; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i32:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpmaxsd %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i32:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i32:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp sgt <16 x i32> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %1
+ %4 = shufflevector <16 x i32> %3, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <16 x i32> %3, %4
+ %6 = select <16 x i1> %5, <16 x i32> %3, <16 x i32> %4
+ %7 = shufflevector <16 x i32> %6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp sgt <16 x i32> %6, %7
+ %9 = select <16 x i1> %8, <16 x i32> %6, <16 x i32> %7
+ %10 = shufflevector <16 x i32> %9, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp sgt <16 x i32> %9, %10
+ %12 = select <16 x i1> %11, <16 x i32> %9, <16 x i32> %10
+ %13 = extractelement <16 x i32> %12, i32 0
+ ret i32 %13
+}
+
+define i16 @test_reduce_v32i16(<32 x i16> %a0) {
+; X86-SSE-LABEL: test_reduce_v32i16:
+; X86-SSE: ## BB#0:
+; X86-SSE-NEXT: pmaxsw %xmm3, %xmm1
+; X86-SSE-NEXT: pmaxsw %xmm2, %xmm0
+; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE-NEXT: pmaxsw %xmm1, %xmm0
+; X86-SSE-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE-NEXT: psrld $16, %xmm1
+; X86-SSE-NEXT: pmaxsw %xmm0, %xmm1
+; X86-SSE-NEXT: movd %xmm1, %eax
+; X86-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i16:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpmaxsw %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i16:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE-LABEL: test_reduce_v32i16:
+; X64-SSE: ## BB#0:
+; X64-SSE-NEXT: pmaxsw %xmm3, %xmm1
+; X64-SSE-NEXT: pmaxsw %xmm2, %xmm0
+; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE-NEXT: pmaxsw %xmm1, %xmm0
+; X64-SSE-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE-NEXT: psrld $16, %xmm1
+; X64-SSE-NEXT: pmaxsw %xmm0, %xmm1
+; X64-SSE-NEXT: movd %xmm1, %eax
+; X64-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i16:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpmaxsw %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i16:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i16:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp sgt <32 x i16> %a0, %1
+ %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1
+ %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <32 x i16> %3, %4
+ %6 = select <32 x i1> %5, <32 x i16> %3, <32 x i16> %4
+ %7 = shufflevector <32 x i16> %6, <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp sgt <32 x i16> %6, %7
+ %9 = select <32 x i1> %8, <32 x i16> %6, <32 x i16> %7
+ %10 = shufflevector <32 x i16> %9, <32 x i16> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp sgt <32 x i16> %9, %10
+ %12 = select <32 x i1> %11, <32 x i16> %9, <32 x i16> %10
+ %13 = shufflevector <32 x i16> %12, <32 x i16> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp sgt <32 x i16> %12, %13
+ %15 = select <32 x i1> %14, <32 x i16> %12, <32 x i16> %13
+ %16 = extractelement <32 x i16> %15, i32 0
+ ret i16 %16
+}
+
+define i8 @test_reduce_v64i8(<64 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v64i8:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm5
+; X86-SSE2-NEXT: pcmpgtb %xmm3, %xmm5
+; X86-SSE2-NEXT: pand %xmm5, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm5
+; X86-SSE2-NEXT: por %xmm1, %xmm5
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X86-SSE2-NEXT: pcmpgtb %xmm5, %xmm0
+; X86-SSE2-NEXT: pand %xmm0, %xmm4
+; X86-SSE2-NEXT: pandn %xmm5, %xmm0
+; X86-SSE2-NEXT: por %xmm4, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v64i8:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pmaxsb %xmm3, %xmm1
+; X86-SSE42-NEXT: pmaxsb %xmm2, %xmm0
+; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v64i8:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v64i8:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v64i8:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm5
+; X64-SSE2-NEXT: pcmpgtb %xmm3, %xmm5
+; X64-SSE2-NEXT: pand %xmm5, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm5
+; X64-SSE2-NEXT: por %xmm1, %xmm5
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X64-SSE2-NEXT: pcmpgtb %xmm5, %xmm0
+; X64-SSE2-NEXT: pand %xmm0, %xmm4
+; X64-SSE2-NEXT: pandn %xmm5, %xmm0
+; X64-SSE2-NEXT: por %xmm4, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v64i8:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pmaxsb %xmm3, %xmm1
+; X64-SSE42-NEXT: pmaxsb %xmm2, %xmm0
+; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v64i8:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v64i8:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v64i8:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp sgt <64 x i8> %a0, %1
+ %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1
+ %4 = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <64 x i8> %3, %4
+ %6 = select <64 x i1> %5, <64 x i8> %3, <64 x i8> %4
+ %7 = shufflevector <64 x i8> %6, <64 x i8> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp sgt <64 x i8> %6, %7
+ %9 = select <64 x i1> %8, <64 x i8> %6, <64 x i8> %7
+ %10 = shufflevector <64 x i8> %9, <64 x i8> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp sgt <64 x i8> %9, %10
+ %12 = select <64 x i1> %11, <64 x i8> %9, <64 x i8> %10
+ %13 = shufflevector <64 x i8> %12, <64 x i8> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp sgt <64 x i8> %12, %13
+ %15 = select <64 x i1> %14, <64 x i8> %12, <64 x i8> %13
+ %16 = shufflevector <64 x i8> %15, <64 x i8> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %17 = icmp sgt <64 x i8> %15, %16
+ %18 = select <64 x i1> %17, <64 x i8> %15, <64 x i8> %16
+ %19 = extractelement <64 x i8> %18, i32 0
+ ret i8 %19
+}
diff --git a/test/CodeGen/X86/horizontal-reduce-smin.ll b/test/CodeGen/X86/horizontal-reduce-smin.ll
new file mode 100644
index 00000000000..6feb963426b
--- /dev/null
+++ b/test/CodeGen/X86/horizontal-reduce-smin.ll
@@ -0,0 +1,1898 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE2
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE42
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE42
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX512
+
+;
+; 128-bit Vectors
+;
+
+define i64 @test_reduce_v2i64(<2 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v2i64:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: pxor %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm2, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm3
+; X86-SSE2-NEXT: por %xmm0, %xmm3
+; X86-SSE2-NEXT: movd %xmm3, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v2i64:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT: movd %xmm2, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v2i64:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v2i64:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: pxor %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm3
+; X64-SSE2-NEXT: por %xmm0, %xmm3
+; X64-SSE2-NEXT: movq %xmm3, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v2i64:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT: movq %xmm2, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v2i64:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v2i64:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v2i64:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminsq %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+ %2 = icmp slt <2 x i64> %a0, %1
+ %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %1
+ %4 = extractelement <2 x i64> %3, i32 0
+ ret i64 %4
+}
+
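+; Signed-min reduction of <4 x i32>: two halving shuffle/icmp slt/select steps, then element 0 is extracted.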
+define i32 @test_reduce_v4i32(<4 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i32:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i32:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminsd %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v4i32:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i32:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i32:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminsd %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v4i32:
+; X64-AVX: ## BB#0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovd %xmm0, %eax
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ %2 = icmp slt <4 x i32> %a0, %1
+ %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %1
+ %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <4 x i32> %3, %4
+ %6 = select <4 x i1> %5, <4 x i32> %3, <4 x i32> %4
+ %7 = extractelement <4 x i32> %6, i32 0
+ ret i32 %7
+}
+
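+; Signed-min reduction of <8 x i16>: three halving shuffle/icmp slt/select steps, then element 0 is extracted.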
+define i16 @test_reduce_v8i16(<8 x i16> %a0) {
+; X86-SSE-LABEL: test_reduce_v8i16:
+; X86-SSE: ## BB#0:
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE-NEXT: pminsw %xmm0, %xmm1
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE-NEXT: pminsw %xmm1, %xmm0
+; X86-SSE-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE-NEXT: psrld $16, %xmm1
+; X86-SSE-NEXT: pminsw %xmm0, %xmm1
+; X86-SSE-NEXT: movd %xmm1, %eax
+; X86-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v8i16:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: test_reduce_v8i16:
+; X64-SSE: ## BB#0:
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE-NEXT: pminsw %xmm0, %xmm1
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE-NEXT: psrld $16, %xmm1
+; X64-SSE-NEXT: pminsw %xmm0, %xmm1
+; X64-SSE-NEXT: movd %xmm1, %eax
+; X64-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v8i16:
+; X64-AVX: ## BB#0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovd %xmm0, %eax
+; X64-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp slt <8 x i16> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %1
+ %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <8 x i16> %3, %4
+ %6 = select <8 x i1> %5, <8 x i16> %3, <8 x i16> %4
+ %7 = shufflevector <8 x i16> %6, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp slt <8 x i16> %6, %7
+ %9 = select <8 x i1> %8, <8 x i16> %6, <8 x i16> %7
+ %10 = extractelement <8 x i16> %9, i32 0
+ ret i16 %10
+}
+
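+; Signed-min reduction of <16 x i8>: four halving shuffle/icmp slt/select steps, then element 0 is extracted.
+; SSE2 has no pminsb, so each step is expanded with pcmpgtb/pand/pandn/por.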
+define i8 @test_reduce_v16i8(<16 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i8:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i8:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v16i8:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i8:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i8:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v16i8:
+; X64-AVX: ## BB#0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp slt <16 x i8> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %1
+ %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <16 x i8> %3, %4
+ %6 = select <16 x i1> %5, <16 x i8> %3, <16 x i8> %4
+ %7 = shufflevector <16 x i8> %6, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp slt <16 x i8> %6, %7
+ %9 = select <16 x i1> %8, <16 x i8> %6, <16 x i8> %7
+ %10 = shufflevector <16 x i8> %9, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp slt <16 x i8> %9, %10
+ %12 = select <16 x i1> %11, <16 x i8> %9, <16 x i8> %10
+ %13 = extractelement <16 x i8> %12, i32 0
+ ret i8 %13
+}
+
+;
+; 256-bit Vectors
+;
+
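+; Signed-min reduction of <4 x i64>: two halving shuffle/icmp slt/select steps, then element 0 is extracted.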
+define i64 @test_reduce_v4i64(<4 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i64:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm6, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i64:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT: movd %xmm2, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v4i64:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v4i64:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i64:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm6, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm4, %xmm2
+; X64-SSE2-NEXT: movq %xmm2, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i64:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT: movq %xmm2, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v4i64:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v4i64:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v4i64:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminsq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminsq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ %2 = icmp slt <4 x i64> %a0, %1
+ %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %1
+ %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <4 x i64> %3, %4
+ %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> %4
+ %7 = extractelement <4 x i64> %6, i32 0
+ ret i64 %7
+}
+
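+; Signed-min reduction of <8 x i32>: three halving shuffle/icmp slt/select steps, then element 0 is extracted.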
+define i32 @test_reduce_v8i32(<8 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i32:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i32:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminsd %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i32:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i32:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i32:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movd %xmm2, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i32:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminsd %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i32:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i32:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i32:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp slt <8 x i32> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %1
+ %4 = shufflevector <8 x i32> %3, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <8 x i32> %3, %4
+ %6 = select <8 x i1> %5, <8 x i32> %3, <8 x i32> %4
+ %7 = shufflevector <8 x i32> %6, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp slt <8 x i32> %6, %7
+ %9 = select <8 x i1> %8, <8 x i32> %6, <8 x i32> %7
+ %10 = extractelement <8 x i32> %9, i32 0
+ ret i32 %10
+}
+
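+; Signed-min reduction of <16 x i16>: four halving shuffle/icmp slt/select steps, then element 0 is extracted.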
+define i16 @test_reduce_v16i16(<16 x i16> %a0) {
+; X86-SSE-LABEL: test_reduce_v16i16:
+; X86-SSE: ## BB#0:
+; X86-SSE-NEXT: pminsw %xmm1, %xmm0
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE-NEXT: pminsw %xmm0, %xmm1
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE-NEXT: pminsw %xmm1, %xmm0
+; X86-SSE-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE-NEXT: psrld $16, %xmm1
+; X86-SSE-NEXT: pminsw %xmm0, %xmm1
+; X86-SSE-NEXT: movd %xmm1, %eax
+; X86-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i16:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i16:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE-LABEL: test_reduce_v16i16:
+; X64-SSE: ## BB#0:
+; X64-SSE-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE-NEXT: pminsw %xmm0, %xmm1
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE-NEXT: psrld $16, %xmm1
+; X64-SSE-NEXT: pminsw %xmm0, %xmm1
+; X64-SSE-NEXT: movd %xmm1, %eax
+; X64-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i16:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i16:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i16:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp slt <16 x i16> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1
+ %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <16 x i16> %3, %4
+ %6 = select <16 x i1> %5, <16 x i16> %3, <16 x i16> %4
+ %7 = shufflevector <16 x i16> %6, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp slt <16 x i16> %6, %7
+ %9 = select <16 x i1> %8, <16 x i16> %6, <16 x i16> %7
+ %10 = shufflevector <16 x i16> %9, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp slt <16 x i16> %9, %10
+ %12 = select <16 x i1> %11, <16 x i16> %9, <16 x i16> %10
+ %13 = extractelement <16 x i16> %12, i32 0
+ ret i16 %13
+}
+
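+; Signed-min reduction of <32 x i8>: five halving shuffle/icmp slt/select steps, then element 0 is extracted.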
+define i8 @test_reduce_v32i8(<32 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v32i8:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v32i8:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i8:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i8:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v32i8:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movd %xmm2, %eax
+; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v32i8:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i8:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i8:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i8:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp slt <32 x i8> %a0, %1
+ %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1
+ %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <32 x i8> %3, %4
+ %6 = select <32 x i1> %5, <32 x i8> %3, <32 x i8> %4
+ %7 = shufflevector <32 x i8> %6, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp slt <32 x i8> %6, %7
+ %9 = select <32 x i1> %8, <32 x i8> %6, <32 x i8> %7
+ %10 = shufflevector <32 x i8> %9, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp slt <32 x i8> %9, %10
+ %12 = select <32 x i1> %11, <32 x i8> %9, <32 x i8> %10
+ %13 = shufflevector <32 x i8> %12, <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp slt <32 x i8> %12, %13
+ %15 = select <32 x i1> %14, <32 x i8> %12, <32 x i8> %13
+ %16 = extractelement <32 x i8> %15, i32 0
+ ret i8 %16
+}
+
+;
+; 512-bit Vectors
+;
+
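+; Signed-min reduction of <8 x i64>: three halving shuffle/icmp slt/select steps, then element 0 is extracted.
+; On 32-bit SSE2 the four source vectors plus the expanded compares require a stack spill.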
+define i64 @test_reduce_v8i64(<8 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i64:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: subl $28, %esp
+; X86-SSE2-NEXT: .cfi_def_cfa_offset 32
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm6, (%esp) ## 16-byte Spill
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm7
+; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm6
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm6, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm7
+; X86-SSE2-NEXT: pxor %xmm4, %xmm7
+; X86-SSE2-NEXT: movdqa %xmm7, %xmm0
+; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm0
+; X86-SSE2-NEXT: pcmpeqd %xmm6, %xmm7
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; X86-SSE2-NEXT: pand %xmm6, %xmm7
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm7, %xmm6
+; X86-SSE2-NEXT: pand %xmm6, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm6
+; X86-SSE2-NEXT: por %xmm1, %xmm6
+; X86-SSE2-NEXT: pand %xmm5, %xmm2
+; X86-SSE2-NEXT: pandn (%esp), %xmm5 ## 16-byte Folded Reload
+; X86-SSE2-NEXT: por %xmm2, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X86-SSE2-NEXT: pxor %xmm4, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm1, %xmm0
+; X86-SSE2-NEXT: pand %xmm0, %xmm5
+; X86-SSE2-NEXT: pandn %xmm6, %xmm0
+; X86-SSE2-NEXT: por %xmm5, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: pxor %xmm1, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: addl $28, %esp
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i64:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE42-NEXT: movdqa %xmm3, %xmm5
+; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm5
+; X86-SSE42-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2
+; X86-SSE42-NEXT: movdqa %xmm5, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; X86-SSE42-NEXT: movapd %xmm3, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm1, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i64:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i64:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i64:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,0,2147483648,0]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm5
+; X64-SSE2-NEXT: pxor %xmm9, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm6
+; X64-SSE2-NEXT: pxor %xmm9, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm7
+; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm8, %xmm6
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm6, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT: pxor %xmm9, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm7
+; X64-SSE2-NEXT: pxor %xmm9, %xmm7
+; X64-SSE2-NEXT: movdqa %xmm7, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm6, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm8, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm7, %xmm6
+; X64-SSE2-NEXT: pand %xmm6, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm6
+; X64-SSE2-NEXT: por %xmm1, %xmm6
+; X64-SSE2-NEXT: pand %xmm5, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm5
+; X64-SSE2-NEXT: por %xmm0, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X64-SSE2-NEXT: pxor %xmm9, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm1
+; X64-SSE2-NEXT: pxor %xmm9, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm5
+; X64-SSE2-NEXT: pandn %xmm6, %xmm1
+; X64-SSE2-NEXT: por %xmm5, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm9, %xmm2
+; X64-SSE2-NEXT: pxor %xmm0, %xmm9
+; X64-SSE2-NEXT: movdqa %xmm9, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm2, %xmm9
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm4, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm1, %xmm3
+; X64-SSE2-NEXT: movq %xmm3, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i64:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE42-NEXT: movdqa %xmm3, %xmm5
+; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm5
+; X64-SSE42-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm4, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2
+; X64-SSE42-NEXT: movdqa %xmm5, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; X64-SSE42-NEXT: movapd %xmm3, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; X64-SSE42-NEXT: movq %xmm1, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i64:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i64:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i64:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp slt <8 x i64> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %1
+ %4 = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <8 x i64> %3, %4
+ %6 = select <8 x i1> %5, <8 x i64> %3, <8 x i64> %4
+ %7 = shufflevector <8 x i64> %6, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp slt <8 x i64> %6, %7
+ %9 = select <8 x i1> %8, <8 x i64> %6, <8 x i64> %7
+ %10 = extractelement <8 x i64> %9, i32 0
+ ret i64 %10
+}
+
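+; Signed-min reduction of <16 x i32>: four halving shuffle/icmp slt/select steps, then element 0 is extracted.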
+define i32 @test_reduce_v16i32(<16 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i32:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm5
+; X86-SSE2-NEXT: pand %xmm5, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm5
+; X86-SSE2-NEXT: por %xmm0, %xmm5
+; X86-SSE2-NEXT: pand %xmm4, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm4
+; X86-SSE2-NEXT: por %xmm1, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm0
+; X86-SSE2-NEXT: pand %xmm0, %xmm5
+; X86-SSE2-NEXT: pandn %xmm4, %xmm0
+; X86-SSE2-NEXT: por %xmm5, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i32:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pminsd %xmm3, %xmm1
+; X86-SSE42-NEXT: pminsd %xmm2, %xmm0
+; X86-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminsd %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i32:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpminsd %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpminsd %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i32:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i32:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm5
+; X64-SSE2-NEXT: pand %xmm5, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm5
+; X64-SSE2-NEXT: por %xmm0, %xmm5
+; X64-SSE2-NEXT: pand %xmm4, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm4
+; X64-SSE2-NEXT: por %xmm1, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm0
+; X64-SSE2-NEXT: pand %xmm0, %xmm5
+; X64-SSE2-NEXT: pandn %xmm4, %xmm0
+; X64-SSE2-NEXT: por %xmm5, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i32:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pminsd %xmm3, %xmm1
+; X64-SSE42-NEXT: pminsd %xmm2, %xmm0
+; X64-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminsd %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i32:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpminsd %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpminsd %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i32:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i32:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp slt <16 x i32> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %1
+ %4 = shufflevector <16 x i32> %3, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <16 x i32> %3, %4
+ %6 = select <16 x i1> %5, <16 x i32> %3, <16 x i32> %4
+ %7 = shufflevector <16 x i32> %6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp slt <16 x i32> %6, %7
+ %9 = select <16 x i1> %8, <16 x i32> %6, <16 x i32> %7
+ %10 = shufflevector <16 x i32> %9, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp slt <16 x i32> %9, %10
+ %12 = select <16 x i1> %11, <16 x i32> %9, <16 x i32> %10
+ %13 = extractelement <16 x i32> %12, i32 0
+ ret i32 %13
+}
+
+define i16 @test_reduce_v32i16(<32 x i16> %a0) {
+; X86-SSE-LABEL: test_reduce_v32i16:
+; X86-SSE: ## BB#0:
+; X86-SSE-NEXT: pminsw %xmm3, %xmm1
+; X86-SSE-NEXT: pminsw %xmm2, %xmm0
+; X86-SSE-NEXT: pminsw %xmm1, %xmm0
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE-NEXT: pminsw %xmm0, %xmm1
+; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE-NEXT: pminsw %xmm1, %xmm0
+; X86-SSE-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE-NEXT: psrld $16, %xmm1
+; X86-SSE-NEXT: pminsw %xmm0, %xmm1
+; X86-SSE-NEXT: movd %xmm1, %eax
+; X86-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i16:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpminsw %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpminsw %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i16:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE-LABEL: test_reduce_v32i16:
+; X64-SSE: ## BB#0:
+; X64-SSE-NEXT: pminsw %xmm3, %xmm1
+; X64-SSE-NEXT: pminsw %xmm2, %xmm0
+; X64-SSE-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE-NEXT: pminsw %xmm0, %xmm1
+; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE-NEXT: psrld $16, %xmm1
+; X64-SSE-NEXT: pminsw %xmm0, %xmm1
+; X64-SSE-NEXT: movd %xmm1, %eax
+; X64-SSE-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i16:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpminsw %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpminsw %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i16:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i16:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp slt <32 x i16> %a0, %1
+ %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1
+ %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <32 x i16> %3, %4
+ %6 = select <32 x i1> %5, <32 x i16> %3, <32 x i16> %4
+ %7 = shufflevector <32 x i16> %6, <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp slt <32 x i16> %6, %7
+ %9 = select <32 x i1> %8, <32 x i16> %6, <32 x i16> %7
+ %10 = shufflevector <32 x i16> %9, <32 x i16> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp slt <32 x i16> %9, %10
+ %12 = select <32 x i1> %11, <32 x i16> %9, <32 x i16> %10
+ %13 = shufflevector <32 x i16> %12, <32 x i16> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp slt <32 x i16> %12, %13
+ %15 = select <32 x i1> %14, <32 x i16> %12, <32 x i16> %13
+ %16 = extractelement <32 x i16> %15, i32 0
+ ret i16 %16
+}
+
+define i8 @test_reduce_v64i8(<64 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v64i8:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm5
+; X86-SSE2-NEXT: pand %xmm5, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm5
+; X86-SSE2-NEXT: por %xmm0, %xmm5
+; X86-SSE2-NEXT: pand %xmm4, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm4
+; X86-SSE2-NEXT: por %xmm1, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X86-SSE2-NEXT: pcmpgtb %xmm5, %xmm0
+; X86-SSE2-NEXT: pand %xmm0, %xmm5
+; X86-SSE2-NEXT: pandn %xmm4, %xmm0
+; X86-SSE2-NEXT: por %xmm5, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v64i8:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pminsb %xmm3, %xmm1
+; X86-SSE42-NEXT: pminsb %xmm2, %xmm0
+; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v64i8:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v64i8:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v64i8:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm5
+; X64-SSE2-NEXT: pand %xmm5, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm5
+; X64-SSE2-NEXT: por %xmm0, %xmm5
+; X64-SSE2-NEXT: pand %xmm4, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm4
+; X64-SSE2-NEXT: por %xmm1, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X64-SSE2-NEXT: pcmpgtb %xmm5, %xmm0
+; X64-SSE2-NEXT: pand %xmm0, %xmm5
+; X64-SSE2-NEXT: pandn %xmm4, %xmm0
+; X64-SSE2-NEXT: por %xmm5, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v64i8:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pminsb %xmm3, %xmm1
+; X64-SSE42-NEXT: pminsb %xmm2, %xmm0
+; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v64i8:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v64i8:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v64i8:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp slt <64 x i8> %a0, %1
+ %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1
+ %4 = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <64 x i8> %3, %4
+ %6 = select <64 x i1> %5, <64 x i8> %3, <64 x i8> %4
+ %7 = shufflevector <64 x i8> %6, <64 x i8> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp slt <64 x i8> %6, %7
+ %9 = select <64 x i1> %8, <64 x i8> %6, <64 x i8> %7
+ %10 = shufflevector <64 x i8> %9, <64 x i8> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp slt <64 x i8> %9, %10
+ %12 = select <64 x i1> %11, <64 x i8> %9, <64 x i8> %10
+ %13 = shufflevector <64 x i8> %12, <64 x i8> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp slt <64 x i8> %12, %13
+ %15 = select <64 x i1> %14, <64 x i8> %12, <64 x i8> %13
+ %16 = shufflevector <64 x i8> %15, <64 x i8> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %17 = icmp slt <64 x i8> %15, %16
+ %18 = select <64 x i1> %17, <64 x i8> %15, <64 x i8> %16
+ %19 = extractelement <64 x i8> %18, i32 0
+ ret i8 %19
+}
diff --git a/test/CodeGen/X86/horizontal-reduce-umax.ll b/test/CodeGen/X86/horizontal-reduce-umax.ll
new file mode 100644
index 00000000000..ee9d8955cb5
--- /dev/null
+++ b/test/CodeGen/X86/horizontal-reduce-umax.ll
@@ -0,0 +1,2203 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE2
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE42
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE42
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX512
+
+;
+; 128-bit Vectors
+;
+
+define i64 @test_reduce_v2i64(<2 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v2i64:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: pxor %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm2, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm3
+; X86-SSE2-NEXT: por %xmm0, %xmm3
+; X86-SSE2-NEXT: movd %xmm3, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v2i64:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
+; X86-SSE42-NEXT: pxor %xmm3, %xmm0
+; X86-SSE42-NEXT: pxor %xmm2, %xmm3
+; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT: movd %xmm2, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v2i64:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
+; X86-AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; X86-AVX-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; X86-AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v2i64:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: pxor %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm3
+; X64-SSE2-NEXT: por %xmm0, %xmm3
+; X64-SSE2-NEXT: movq %xmm3, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v2i64:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; X64-SSE42-NEXT: pxor %xmm3, %xmm0
+; X64-SSE42-NEXT: pxor %xmm2, %xmm3
+; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT: movq %xmm2, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v2i64:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v2i64:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v2i64:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+ %2 = icmp ugt <2 x i64> %a0, %1
+ %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %1
+ %4 = extractelement <2 x i64> %3, i32 0
+ ret i64 %4
+}
+
+define i32 @test_reduce_v4i32(<4 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i32:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm3
+; X86-SSE2-NEXT: por %xmm0, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm3
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm3, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i32:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxud %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v4i32:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i32:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm3
+; X64-SSE2-NEXT: por %xmm0, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm3
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm3, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i32:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxud %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v4i32:
+; X64-AVX: ## BB#0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovd %xmm0, %eax
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ %2 = icmp ugt <4 x i32> %a0, %1
+ %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %1
+ %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <4 x i32> %3, %4
+ %6 = select <4 x i1> %5, <4 x i32> %3, <4 x i32> %4
+ %7 = extractelement <4 x i32> %6, i32 0
+ ret i32 %7
+}
+
+define i16 @test_reduce_v8i16(<8 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i16:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm1, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
+; X86-SSE2-NEXT: pxor %xmm1, %xmm4
+; X86-SSE2-NEXT: pcmpgtw %xmm4, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm3
+; X86-SSE2-NEXT: por %xmm0, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm2
+; X86-SSE2-NEXT: pxor %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pxor %xmm1, %xmm4
+; X86-SSE2-NEXT: pcmpgtw %xmm4, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm3
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm3, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT: pxor %xmm1, %xmm3
+; X86-SSE2-NEXT: pxor %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtw %xmm1, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm3
+; X86-SSE2-NEXT: por %xmm2, %xmm3
+; X86-SSE2-NEXT: movd %xmm3, %eax
+; X86-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i16:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxuw %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v8i16:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i16:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm1, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm4
+; X64-SSE2-NEXT: pxor %xmm1, %xmm4
+; X64-SSE2-NEXT: pcmpgtw %xmm4, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm3
+; X64-SSE2-NEXT: por %xmm0, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm2
+; X64-SSE2-NEXT: pxor %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT: pxor %xmm1, %xmm4
+; X64-SSE2-NEXT: pcmpgtw %xmm4, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm3
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm3, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X64-SSE2-NEXT: pxor %xmm1, %xmm3
+; X64-SSE2-NEXT: pxor %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtw %xmm1, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm2, %xmm3
+; X64-SSE2-NEXT: movd %xmm3, %eax
+; X64-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i16:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxuw %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X64-SSE42-NEXT: movd %xmm1, %eax
+; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v8i16:
+; X64-AVX: ## BB#0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovd %xmm0, %eax
+; X64-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ugt <8 x i16> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %1
+ %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <8 x i16> %3, %4
+ %6 = select <8 x i1> %5, <8 x i16> %3, <8 x i16> %4
+ %7 = shufflevector <8 x i16> %6, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ugt <8 x i16> %6, %7
+ %9 = select <8 x i1> %8, <8 x i16> %6, <8 x i16> %7
+ %10 = extractelement <8 x i16> %9, i32 0
+ ret i16 %10
+}
+
+define i8 @test_reduce_v16i8(<16 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i8:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT: movd %xmm0, %eax
+; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i8:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v16i8:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i8:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT: movd %xmm0, %eax
+; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i8:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v16i8:
+; X64-AVX: ## BB#0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ugt <16 x i8> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %1
+ %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <16 x i8> %3, %4
+ %6 = select <16 x i1> %5, <16 x i8> %3, <16 x i8> %4
+ %7 = shufflevector <16 x i8> %6, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ugt <16 x i8> %6, %7
+ %9 = select <16 x i1> %8, <16 x i8> %6, <16 x i8> %7
+ %10 = shufflevector <16 x i8> %9, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ugt <16 x i8> %9, %10
+ %12 = select <16 x i1> %11, <16 x i8> %9, <16 x i8> %10
+ %13 = extractelement <16 x i8> %12, i32 0
+ ret i8 %13
+}
+
+;
+; 256-bit Vectors
+;
+
+define i64 @test_reduce_v4i64(<4 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i64:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm6, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i64:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm4
+; X86-SSE42-NEXT: pxor %xmm3, %xmm4
+; X86-SSE42-NEXT: pxor %xmm3, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: pxor %xmm3, %xmm0
+; X86-SSE42-NEXT: pxor %xmm2, %xmm3
+; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT: movd %xmm2, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v4i64:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
+; X86-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
+; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; X86-AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
+; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X86-AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v4i64:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
+; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4
+; X86-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3
+; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2
+; X86-AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i64:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm6, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm4, %xmm2
+; X64-SSE2-NEXT: movq %xmm2, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i64:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm4
+; X64-SSE42-NEXT: pxor %xmm3, %xmm4
+; X64-SSE42-NEXT: pxor %xmm3, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm4, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: pxor %xmm3, %xmm0
+; X64-SSE42-NEXT: pxor %xmm2, %xmm3
+; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT: movq %xmm2, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v4i64:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; X64-AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
+; X64-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v4i64:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4
+; X64-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3
+; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2
+; X64-AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v4i64:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ %2 = icmp ugt <4 x i64> %a0, %1
+ %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %1
+ %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <4 x i64> %3, %4
+ %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> %4
+ %7 = extractelement <4 x i64> %6, i32 0
+ ret i64 %7
+}
+
+define i32 @test_reduce_v8i32(<8 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i32:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm4, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm3
+; X86-SSE2-NEXT: por %xmm1, %xmm3
+; X86-SSE2-NEXT: movd %xmm3, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i32:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxud %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i32:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i32:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i32:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm4, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm1, %xmm3
+; X64-SSE2-NEXT: movd %xmm3, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i32:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxud %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i32:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i32:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i32:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ugt <8 x i32> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %1
+ %4 = shufflevector <8 x i32> %3, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <8 x i32> %3, %4
+ %6 = select <8 x i1> %5, <8 x i32> %3, <8 x i32> %4
+ %7 = shufflevector <8 x i32> %6, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ugt <8 x i32> %6, %7
+ %9 = select <8 x i1> %8, <8 x i32> %6, <8 x i32> %7
+ %10 = extractelement <8 x i32> %9, i32 0
+ ret i32 %10
+}
+
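+; Unsigned-max reduction of <16 x i16> to a scalar i16 (icmp ugt + select chain).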
+define i16 @test_reduce_v16i16(<16 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i16:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtw %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpgtw %xmm3, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm4, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtw %xmm4, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm3
+; X86-SSE2-NEXT: por %xmm1, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtw %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm3
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm3, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i16:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pmaxuw %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxuw %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i16:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i16:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i16:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtw %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: pcmpgtw %xmm3, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm4, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtw %xmm4, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm1, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtw %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm3
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm3, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i16:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pmaxuw %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxuw %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X64-SSE42-NEXT: movd %xmm1, %eax
+; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i16:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i16:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i16:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ugt <16 x i16> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1
+ %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <16 x i16> %3, %4
+ %6 = select <16 x i1> %5, <16 x i16> %3, <16 x i16> %4
+ %7 = shufflevector <16 x i16> %6, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ugt <16 x i16> %6, %7
+ %9 = select <16 x i1> %8, <16 x i16> %6, <16 x i16> %7
+ %10 = shufflevector <16 x i16> %9, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ugt <16 x i16> %9, %10
+ %12 = select <16 x i1> %11, <16 x i16> %9, <16 x i16> %10
+ %13 = extractelement <16 x i16> %12, i32 0
+ ret i16 %13
+}
+
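+; Unsigned-max reduction of <32 x i8> to a scalar i8 (icmp ugt + select chain).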
+define i8 @test_reduce_v32i8(<32 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v32i8:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT: movd %xmm0, %eax
+; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v32i8:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i8:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i8:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v32i8:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT: movd %xmm0, %eax
+; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v32i8:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i8:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i8:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i8:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ugt <32 x i8> %a0, %1
+ %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1
+ %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <32 x i8> %3, %4
+ %6 = select <32 x i1> %5, <32 x i8> %3, <32 x i8> %4
+ %7 = shufflevector <32 x i8> %6, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ugt <32 x i8> %6, %7
+ %9 = select <32 x i1> %8, <32 x i8> %6, <32 x i8> %7
+ %10 = shufflevector <32 x i8> %9, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ugt <32 x i8> %9, %10
+ %12 = select <32 x i1> %11, <32 x i8> %9, <32 x i8> %10
+ %13 = shufflevector <32 x i8> %12, <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp ugt <32 x i8> %12, %13
+ %15 = select <32 x i1> %14, <32 x i8> %12, <32 x i8> %13
+ %16 = extractelement <32 x i8> %15, i32 0
+ ret i8 %16
+}
+
+;
+; 512-bit Vectors
+;
+
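+; Unsigned-max reduction of <8 x i64> to a scalar i64 (icmp ugt + select chain).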
+define i64 @test_reduce_v8i64(<8 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i64:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: subl $28, %esp
+; X86-SSE2-NEXT: .cfi_def_cfa_offset 32
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm5, (%esp) ## 16-byte Spill
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm7
+; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm6
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm6, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm7
+; X86-SSE2-NEXT: pxor %xmm4, %xmm7
+; X86-SSE2-NEXT: movdqa %xmm7, %xmm0
+; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm0
+; X86-SSE2-NEXT: pcmpeqd %xmm6, %xmm7
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; X86-SSE2-NEXT: pand %xmm6, %xmm7
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm7, %xmm6
+; X86-SSE2-NEXT: pand %xmm6, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm6
+; X86-SSE2-NEXT: por %xmm1, %xmm6
+; X86-SSE2-NEXT: pand %xmm5, %xmm2
+; X86-SSE2-NEXT: pandn (%esp), %xmm5 ## 16-byte Folded Reload
+; X86-SSE2-NEXT: por %xmm2, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X86-SSE2-NEXT: pxor %xmm4, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm1, %xmm0
+; X86-SSE2-NEXT: pand %xmm0, %xmm6
+; X86-SSE2-NEXT: pandn %xmm5, %xmm0
+; X86-SSE2-NEXT: por %xmm6, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: pxor %xmm1, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: addl $28, %esp
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i64:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm6 = [0,2147483648,0,2147483648]
+; X86-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X86-SSE42-NEXT: pxor %xmm6, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm5
+; X86-SSE42-NEXT: pxor %xmm6, %xmm5
+; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm5
+; X86-SSE42-NEXT: movdqa %xmm2, %xmm7
+; X86-SSE42-NEXT: pxor %xmm6, %xmm7
+; X86-SSE42-NEXT: movdqa %xmm4, %xmm0
+; X86-SSE42-NEXT: pxor %xmm6, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm7, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2
+; X86-SSE42-NEXT: movdqa %xmm5, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; X86-SSE42-NEXT: movapd %xmm3, %xmm1
+; X86-SSE42-NEXT: xorpd %xmm6, %xmm1
+; X86-SSE42-NEXT: movapd %xmm2, %xmm0
+; X86-SSE42-NEXT: xorpd %xmm6, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X86-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X86-SSE42-NEXT: pxor %xmm6, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm6
+; X86-SSE42-NEXT: pcmpgtq %xmm6, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm1, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i64:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm5
+; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm2
+; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2
+; X86-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm3
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i64:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
+; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
+; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm4
+; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
+; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4
+; X86-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3
+; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2
+; X86-AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i64:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X64-SSE2-NEXT: pxor %xmm4, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm7
+; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm8, %xmm6
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm6, %xmm8
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm7
+; X64-SSE2-NEXT: pxor %xmm4, %xmm7
+; X64-SSE2-NEXT: movdqa %xmm7, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm5
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm6, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm9, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm7, %xmm6
+; X64-SSE2-NEXT: pand %xmm6, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm6
+; X64-SSE2-NEXT: por %xmm0, %xmm6
+; X64-SSE2-NEXT: pand %xmm8, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm8
+; X64-SSE2-NEXT: por %xmm1, %xmm8
+; X64-SSE2-NEXT: movdqa %xmm8, %xmm0
+; X64-SSE2-NEXT: pxor %xmm4, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm6
+; X64-SSE2-NEXT: pandn %xmm8, %xmm1
+; X64-SSE2-NEXT: por %xmm6, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm4, %xmm2
+; X64-SSE2-NEXT: pxor %xmm0, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm2, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm1, %xmm3
+; X64-SSE2-NEXT: movq %xmm3, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i64:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808]
+; X64-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X64-SSE42-NEXT: pxor %xmm6, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm5
+; X64-SSE42-NEXT: pxor %xmm6, %xmm5
+; X64-SSE42-NEXT: pcmpgtq %xmm0, %xmm5
+; X64-SSE42-NEXT: movdqa %xmm2, %xmm7
+; X64-SSE42-NEXT: pxor %xmm6, %xmm7
+; X64-SSE42-NEXT: movdqa %xmm4, %xmm0
+; X64-SSE42-NEXT: pxor %xmm6, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm7, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2
+; X64-SSE42-NEXT: movdqa %xmm5, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; X64-SSE42-NEXT: movapd %xmm3, %xmm1
+; X64-SSE42-NEXT: xorpd %xmm6, %xmm1
+; X64-SSE42-NEXT: movapd %xmm2, %xmm0
+; X64-SSE42-NEXT: xorpd %xmm6, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X64-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X64-SSE42-NEXT: pxor %xmm6, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm6
+; X64-SSE42-NEXT: pcmpgtq %xmm6, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; X64-SSE42-NEXT: movq %xmm1, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i64:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm5
+; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm2
+; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2
+; X64-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i64:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
+; X64-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm4
+; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
+; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4
+; X64-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3
+; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2
+; X64-AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i64:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ugt <8 x i64> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %1
+ %4 = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <8 x i64> %3, %4
+ %6 = select <8 x i1> %5, <8 x i64> %3, <8 x i64> %4
+ %7 = shufflevector <8 x i64> %6, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ugt <8 x i64> %6, %7
+ %9 = select <8 x i1> %8, <8 x i64> %6, <8 x i64> %7
+ %10 = extractelement <8 x i64> %9, i32 0
+ ret i64 %10
+}
+
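+; Unsigned-max reduction of <16 x i32> to a scalar i32 (icmp ugt + select chain).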
+define i32 @test_reduce_v16i32(<16 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i32:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm7
+; X86-SSE2-NEXT: pxor %xmm4, %xmm7
+; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X86-SSE2-NEXT: pand %xmm7, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm7
+; X86-SSE2-NEXT: por %xmm0, %xmm7
+; X86-SSE2-NEXT: pand %xmm6, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm6
+; X86-SSE2-NEXT: por %xmm1, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm0
+; X86-SSE2-NEXT: pxor %xmm4, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm7, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm7
+; X86-SSE2-NEXT: pandn %xmm6, %xmm1
+; X86-SSE2-NEXT: por %xmm7, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm4, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i32:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pmaxud %xmm3, %xmm1
+; X86-SSE42-NEXT: pmaxud %xmm2, %xmm0
+; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxud %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i32:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpmaxud %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i32:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i32:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X64-SSE2-NEXT: pxor %xmm4, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X64-SSE2-NEXT: pxor %xmm4, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm7
+; X64-SSE2-NEXT: pxor %xmm4, %xmm7
+; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X64-SSE2-NEXT: pand %xmm7, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm7
+; X64-SSE2-NEXT: por %xmm0, %xmm7
+; X64-SSE2-NEXT: pand %xmm6, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm6
+; X64-SSE2-NEXT: por %xmm1, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm0
+; X64-SSE2-NEXT: pxor %xmm4, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm7, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm7
+; X64-SSE2-NEXT: pandn %xmm6, %xmm1
+; X64-SSE2-NEXT: por %xmm7, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm4, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm4, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm4, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i32:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pmaxud %xmm3, %xmm1
+; X64-SSE42-NEXT: pmaxud %xmm2, %xmm0
+; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxud %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i32:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpmaxud %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i32:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i32:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ugt <16 x i32> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %1
+ %4 = shufflevector <16 x i32> %3, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <16 x i32> %3, %4
+ %6 = select <16 x i1> %5, <16 x i32> %3, <16 x i32> %4
+ %7 = shufflevector <16 x i32> %6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ugt <16 x i32> %6, %7
+ %9 = select <16 x i1> %8, <16 x i32> %6, <16 x i32> %7
+ %10 = shufflevector <16 x i32> %9, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ugt <16 x i32> %9, %10
+ %12 = select <16 x i1> %11, <16 x i32> %9, <16 x i32> %10
+ %13 = extractelement <16 x i32> %12, i32 0
+ ret i32 %13
+}
+
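+; Unsigned-max reduction of <32 x i16> to a scalar i16 (icmp ugt + select chain).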
+define i16 @test_reduce_v32i16(<32 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v32i16:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: pcmpgtw %xmm5, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm7
+; X86-SSE2-NEXT: pxor %xmm4, %xmm7
+; X86-SSE2-NEXT: pcmpgtw %xmm5, %xmm7
+; X86-SSE2-NEXT: pand %xmm7, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm7
+; X86-SSE2-NEXT: por %xmm0, %xmm7
+; X86-SSE2-NEXT: pand %xmm6, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm6
+; X86-SSE2-NEXT: por %xmm1, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm0
+; X86-SSE2-NEXT: pxor %xmm4, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm7, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: pcmpgtw %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm7
+; X86-SSE2-NEXT: pandn %xmm6, %xmm1
+; X86-SSE2-NEXT: por %xmm7, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm4, %xmm3
+; X86-SSE2-NEXT: pcmpgtw %xmm3, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm4, %xmm3
+; X86-SSE2-NEXT: pcmpgtw %xmm3, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: pxor %xmm0, %xmm4
+; X86-SSE2-NEXT: pcmpgtw %xmm4, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v32i16:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pmaxuw %xmm3, %xmm1
+; X86-SSE42-NEXT: pmaxuw %xmm2, %xmm0
+; X86-SSE42-NEXT: pmaxuw %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxuw %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i16:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpmaxuw %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i16:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v32i16:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X64-SSE2-NEXT: pxor %xmm4, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: pcmpgtw %xmm5, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X64-SSE2-NEXT: pxor %xmm4, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm7
+; X64-SSE2-NEXT: pxor %xmm4, %xmm7
+; X64-SSE2-NEXT: pcmpgtw %xmm5, %xmm7
+; X64-SSE2-NEXT: pand %xmm7, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm7
+; X64-SSE2-NEXT: por %xmm0, %xmm7
+; X64-SSE2-NEXT: pand %xmm6, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm6
+; X64-SSE2-NEXT: por %xmm1, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm0
+; X64-SSE2-NEXT: pxor %xmm4, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm7, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: pcmpgtw %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm7
+; X64-SSE2-NEXT: pandn %xmm6, %xmm1
+; X64-SSE2-NEXT: por %xmm7, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm4, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm4, %xmm3
+; X64-SSE2-NEXT: pcmpgtw %xmm3, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm4, %xmm3
+; X64-SSE2-NEXT: pcmpgtw %xmm3, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm4, %xmm2
+; X64-SSE2-NEXT: pxor %xmm0, %xmm4
+; X64-SSE2-NEXT: pcmpgtw %xmm4, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movd %xmm2, %eax
+; X64-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v32i16:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pmaxuw %xmm3, %xmm1
+; X64-SSE42-NEXT: pmaxuw %xmm2, %xmm0
+; X64-SSE42-NEXT: pmaxuw %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxuw %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; X64-SSE42-NEXT: movd %xmm1, %eax
+; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i16:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpmaxuw %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i16:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i16:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ugt <32 x i16> %a0, %1
+ %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1
+ %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <32 x i16> %3, %4
+ %6 = select <32 x i1> %5, <32 x i16> %3, <32 x i16> %4
+ %7 = shufflevector <32 x i16> %6, <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ugt <32 x i16> %6, %7
+ %9 = select <32 x i1> %8, <32 x i16> %6, <32 x i16> %7
+ %10 = shufflevector <32 x i16> %9, <32 x i16> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ugt <32 x i16> %9, %10
+ %12 = select <32 x i1> %11, <32 x i16> %9, <32 x i16> %10
+ %13 = shufflevector <32 x i16> %12, <32 x i16> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp ugt <32 x i16> %12, %13
+ %15 = select <32 x i1> %14, <32 x i16> %12, <32 x i16> %13
+ %16 = extractelement <32 x i16> %15, i32 0
+ ret i16 %16
+}
+
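+; Unsigned-max reduction of <64 x i8> to a scalar i8 (icmp ugt + select chain).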
+define i8 @test_reduce_v64i8(<64 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v64i8:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pmaxub %xmm3, %xmm1
+; X86-SSE2-NEXT: pmaxub %xmm2, %xmm0
+; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT: movd %xmm0, %eax
+; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v64i8:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pmaxub %xmm3, %xmm1
+; X86-SSE42-NEXT: pmaxub %xmm2, %xmm0
+; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v64i8:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpmaxub %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v64i8:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v64i8:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pmaxub %xmm3, %xmm1
+; X64-SSE2-NEXT: pmaxub %xmm2, %xmm0
+; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT: movd %xmm0, %eax
+; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v64i8:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pmaxub %xmm3, %xmm1
+; X64-SSE42-NEXT: pmaxub %xmm2, %xmm0
+; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v64i8:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpmaxub %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v64i8:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v64i8:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ugt <64 x i8> %a0, %1
+ %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1
+ %4 = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <64 x i8> %3, %4
+ %6 = select <64 x i1> %5, <64 x i8> %3, <64 x i8> %4
+ %7 = shufflevector <64 x i8> %6, <64 x i8> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ugt <64 x i8> %6, %7
+ %9 = select <64 x i1> %8, <64 x i8> %6, <64 x i8> %7
+ %10 = shufflevector <64 x i8> %9, <64 x i8> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ugt <64 x i8> %9, %10
+ %12 = select <64 x i1> %11, <64 x i8> %9, <64 x i8> %10
+ %13 = shufflevector <64 x i8> %12, <64 x i8> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp ugt <64 x i8> %12, %13
+ %15 = select <64 x i1> %14, <64 x i8> %12, <64 x i8> %13
+ %16 = shufflevector <64 x i8> %15, <64 x i8> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %17 = icmp ugt <64 x i8> %15, %16
+ %18 = select <64 x i1> %17, <64 x i8> %15, <64 x i8> %16
+ %19 = extractelement <64 x i8> %18, i32 0
+ ret i8 %19
+}
diff --git a/test/CodeGen/X86/horizontal-reduce-umin.ll b/test/CodeGen/X86/horizontal-reduce-umin.ll
new file mode 100644
index 00000000000..43369673042
--- /dev/null
+++ b/test/CodeGen/X86/horizontal-reduce-umin.ll
@@ -0,0 +1,2207 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE2
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE42
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE42
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX512
+
+;
+; 128-bit Vectors
+;
+
+define i64 @test_reduce_v2i64(<2 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v2i64:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: pxor %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm2, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm3
+; X86-SSE2-NEXT: por %xmm0, %xmm3
+; X86-SSE2-NEXT: movd %xmm3, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v2i64:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm0 = [0,2147483648,0,2147483648]
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE42-NEXT: pxor %xmm0, %xmm3
+; X86-SSE42-NEXT: pxor %xmm2, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT: movd %xmm2, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v2i64:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
+; X86-AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; X86-AVX-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; X86-AVX-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
+; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v2i64:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: pxor %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm3
+; X64-SSE2-NEXT: por %xmm0, %xmm3
+; X64-SSE2-NEXT: movq %xmm3, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v2i64:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE42-NEXT: pxor %xmm0, %xmm3
+; X64-SSE42-NEXT: pxor %xmm2, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT: movq %xmm2, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v2i64:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
+; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v2i64:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
+; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v2i64:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminuq %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+ %2 = icmp ult <2 x i64> %a0, %1
+ %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %1
+ %4 = extractelement <2 x i64> %3, i32 0
+ ret i64 %4
+}
+
+define i32 @test_reduce_v4i32(<4 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i32:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i32:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminud %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminud %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v4i32:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i32:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm4, %xmm2
+; X64-SSE2-NEXT: movd %xmm2, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i32:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminud %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminud %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v4i32:
+; X64-AVX: ## BB#0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovd %xmm0, %eax
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ %2 = icmp ult <4 x i32> %a0, %1
+ %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %1
+ %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <4 x i32> %3, %4
+ %6 = select <4 x i1> %5, <4 x i32> %3, <4 x i32> %4
+ %7 = extractelement <4 x i32> %6, i32 0
+ ret i32 %7
+}
+
+define i16 @test_reduce_v8i16(<8 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i16:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm1, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
+; X86-SSE2-NEXT: pxor %xmm1, %xmm4
+; X86-SSE2-NEXT: pcmpgtw %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm2
+; X86-SSE2-NEXT: pxor %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm1, %xmm3
+; X86-SSE2-NEXT: pcmpgtw %xmm2, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm3
+; X86-SSE2-NEXT: por %xmm4, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm2
+; X86-SSE2-NEXT: pxor %xmm1, %xmm2
+; X86-SSE2-NEXT: pxor %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtw %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm3
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm3, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i16:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v8i16:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i16:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm1, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm4
+; X64-SSE2-NEXT: pxor %xmm1, %xmm4
+; X64-SSE2-NEXT: pcmpgtw %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm2
+; X64-SSE2-NEXT: pxor %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm1, %xmm3
+; X64-SSE2-NEXT: pcmpgtw %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm4, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm2
+; X64-SSE2-NEXT: pxor %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtw %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm3
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm3, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i16:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X64-SSE42-NEXT: movd %xmm1, %eax
+; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v8i16:
+; X64-AVX: ## BB#0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovd %xmm0, %eax
+; X64-AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ult <8 x i16> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %1
+ %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <8 x i16> %3, %4
+ %6 = select <8 x i1> %5, <8 x i16> %3, <8 x i16> %4
+ %7 = shufflevector <8 x i16> %6, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ult <8 x i16> %6, %7
+ %9 = select <8 x i1> %8, <8 x i16> %6, <8 x i16> %7
+ %10 = extractelement <8 x i16> %9, i32 0
+ ret i16 %10
+}
+
+define i8 @test_reduce_v16i8(<16 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i8:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pminub %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pminub %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: movd %xmm0, %eax
+; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i8:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminub %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pminub %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v16i8:
+; X86-AVX: ## BB#0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i8:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pminub %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pminub %xmm0, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: movd %xmm0, %eax
+; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i8:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminub %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pminub %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v16i8:
+; X64-AVX: ## BB#0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ult <16 x i8> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %1
+ %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <16 x i8> %3, %4
+ %6 = select <16 x i1> %5, <16 x i8> %3, <16 x i8> %4
+ %7 = shufflevector <16 x i8> %6, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ult <16 x i8> %6, %7
+ %9 = select <16 x i1> %8, <16 x i8> %6, <16 x i8> %7
+ %10 = shufflevector <16 x i8> %9, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ult <16 x i8> %9, %10
+ %12 = select <16 x i1> %11, <16 x i8> %9, <16 x i8> %10
+ %13 = extractelement <16 x i8> %12, i32 0
+ ret i8 %13
+}
+
+;
+; 256-bit Vectors
+;
+
+define i64 @test_reduce_v4i64(<4 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i64:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm6, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i64:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
+; X86-SSE42-NEXT: movdqa %xmm2, %xmm4
+; X86-SSE42-NEXT: pxor %xmm3, %xmm4
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: pxor %xmm3, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: pxor %xmm3, %xmm0
+; X86-SSE42-NEXT: pxor %xmm2, %xmm3
+; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm3
+; X86-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT: movd %xmm2, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v4i64:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
+; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; X86-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
+; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm4
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; X86-AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
+; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X86-AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm2
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v4i64:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
+; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4
+; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
+; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2
+; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i64:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm6, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm4, %xmm2
+; X64-SSE2-NEXT: movq %xmm2, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i64:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; X64-SSE42-NEXT: movdqa %xmm2, %xmm4
+; X64-SSE42-NEXT: pxor %xmm3, %xmm4
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: pxor %xmm3, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm4, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: pxor %xmm3, %xmm0
+; X64-SSE42-NEXT: pxor %xmm2, %xmm3
+; X64-SSE42-NEXT: pcmpgtq %xmm0, %xmm3
+; X64-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT: movq %xmm2, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v4i64:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm4
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; X64-AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
+; X64-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm2
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v4i64:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4
+; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
+; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2
+; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v4i64:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminuq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminuq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ %2 = icmp ult <4 x i64> %a0, %1
+ %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %1
+ %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <4 x i64> %3, %4
+ %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> %4
+ %7 = extractelement <4 x i64> %6, i32 0
+ ret i64 %7
+}
+
+define i32 @test_reduce_v8i32(<8 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i32:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm3
+; X86-SSE2-NEXT: por %xmm4, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm3
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm3, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i32:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pminud %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminud %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminud %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i32:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i32:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i32:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm4, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm3
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm3, %xmm2
+; X64-SSE2-NEXT: movd %xmm2, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i32:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pminud %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminud %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminud %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i32:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i32:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i32:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ult <8 x i32> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %1
+ %4 = shufflevector <8 x i32> %3, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <8 x i32> %3, %4
+ %6 = select <8 x i1> %5, <8 x i32> %3, <8 x i32> %4
+ %7 = shufflevector <8 x i32> %6, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ult <8 x i32> %6, %7
+ %9 = select <8 x i1> %8, <8 x i32> %6, <8 x i32> %7
+ %10 = extractelement <8 x i32> %9, i32 0
+ ret i32 %10
+}
+
+define i16 @test_reduce_v16i16(<16 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i16:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtw %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpgtw %xmm1, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm3
+; X86-SSE2-NEXT: por %xmm4, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtw %xmm1, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm3
+; X86-SSE2-NEXT: pandn %xmm0, %xmm4
+; X86-SSE2-NEXT: por %xmm3, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtw %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i16:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i16:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i16:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i16:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtw %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: pcmpgtw %xmm1, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm4, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtw %xmm1, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm3
+; X64-SSE2-NEXT: pandn %xmm0, %xmm4
+; X64-SSE2-NEXT: por %xmm3, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtw %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm4, %xmm2
+; X64-SSE2-NEXT: movd %xmm2, %eax
+; X64-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i16:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X64-SSE42-NEXT: movd %xmm1, %eax
+; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i16:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i16:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i16:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ult <16 x i16> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1
+ %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <16 x i16> %3, %4
+ %6 = select <16 x i1> %5, <16 x i16> %3, <16 x i16> %4
+ %7 = shufflevector <16 x i16> %6, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ult <16 x i16> %6, %7
+ %9 = select <16 x i1> %8, <16 x i16> %6, <16 x i16> %7
+ %10 = shufflevector <16 x i16> %9, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ult <16 x i16> %9, %10
+ %12 = select <16 x i1> %11, <16 x i16> %9, <16 x i16> %10
+ %13 = extractelement <16 x i16> %12, i32 0
+ ret i16 %13
+}
+
+define i8 @test_reduce_v32i8(<32 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v32i8:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pminub %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pminub %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: movd %xmm0, %eax
+; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v32i8:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminub %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pminub %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i8:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i8:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v32i8:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pminub %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pminub %xmm0, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: movd %xmm0, %eax
+; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v32i8:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminub %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pminub %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i8:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i8:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i8:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ult <32 x i8> %a0, %1
+ %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1
+ %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <32 x i8> %3, %4
+ %6 = select <32 x i1> %5, <32 x i8> %3, <32 x i8> %4
+ %7 = shufflevector <32 x i8> %6, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ult <32 x i8> %6, %7
+ %9 = select <32 x i1> %8, <32 x i8> %6, <32 x i8> %7
+ %10 = shufflevector <32 x i8> %9, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ult <32 x i8> %9, %10
+ %12 = select <32 x i1> %11, <32 x i8> %9, <32 x i8> %10
+ %13 = shufflevector <32 x i8> %12, <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp ult <32 x i8> %12, %13
+ %15 = select <32 x i1> %14, <32 x i8> %12, <32 x i8> %13
+ %16 = extractelement <32 x i8> %15, i32 0
+ ret i8 %16
+}
+
+;
+; 512-bit Vectors
+;
+
+define i64 @test_reduce_v8i64(<8 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i64:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: subl $28, %esp
+; X86-SSE2-NEXT: .cfi_def_cfa_offset 32
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm6, (%esp) ## 16-byte Spill
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm7
+; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm6
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm6, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm7
+; X86-SSE2-NEXT: pxor %xmm4, %xmm7
+; X86-SSE2-NEXT: movdqa %xmm7, %xmm0
+; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm0
+; X86-SSE2-NEXT: pcmpeqd %xmm6, %xmm7
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; X86-SSE2-NEXT: pand %xmm6, %xmm7
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm7, %xmm6
+; X86-SSE2-NEXT: pand %xmm6, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm6
+; X86-SSE2-NEXT: por %xmm1, %xmm6
+; X86-SSE2-NEXT: pand %xmm5, %xmm2
+; X86-SSE2-NEXT: pandn (%esp), %xmm5 ## 16-byte Folded Reload
+; X86-SSE2-NEXT: por %xmm2, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X86-SSE2-NEXT: pxor %xmm4, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm1, %xmm0
+; X86-SSE2-NEXT: pand %xmm0, %xmm5
+; X86-SSE2-NEXT: pandn %xmm6, %xmm0
+; X86-SSE2-NEXT: por %xmm5, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: pxor %xmm1, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: addl $28, %esp
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i64:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm5
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm4 = [0,2147483648,0,2147483648]
+; X86-SSE42-NEXT: pxor %xmm4, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm2, %xmm6
+; X86-SSE42-NEXT: pxor %xmm4, %xmm6
+; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm6
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm7
+; X86-SSE42-NEXT: pxor %xmm4, %xmm7
+; X86-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X86-SSE42-NEXT: pxor %xmm4, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm7, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; X86-SSE42-NEXT: movdqa %xmm6, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm2
+; X86-SSE42-NEXT: movapd %xmm2, %xmm1
+; X86-SSE42-NEXT: xorpd %xmm4, %xmm1
+; X86-SSE42-NEXT: movapd %xmm3, %xmm0
+; X86-SSE42-NEXT: xorpd %xmm4, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X86-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X86-SSE42-NEXT: pxor %xmm4, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm4
+; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm4
+; X86-SSE42-NEXT: movdqa %xmm4, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm1, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i64:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm4
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm5
+; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm4
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2
+; X86-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm3
+; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i64:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
+; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4
+; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
+; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4
+; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
+; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2
+; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i64:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm5
+; X64-SSE2-NEXT: pxor %xmm9, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm6
+; X64-SSE2-NEXT: pxor %xmm9, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm7
+; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm8, %xmm6
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm6, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT: pxor %xmm9, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm7
+; X64-SSE2-NEXT: pxor %xmm9, %xmm7
+; X64-SSE2-NEXT: movdqa %xmm7, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm6, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm8, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm7, %xmm6
+; X64-SSE2-NEXT: pand %xmm6, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm6
+; X64-SSE2-NEXT: por %xmm1, %xmm6
+; X64-SSE2-NEXT: pand %xmm5, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm5
+; X64-SSE2-NEXT: por %xmm0, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X64-SSE2-NEXT: pxor %xmm9, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm1
+; X64-SSE2-NEXT: pxor %xmm9, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm5
+; X64-SSE2-NEXT: pandn %xmm6, %xmm1
+; X64-SSE2-NEXT: por %xmm5, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm9, %xmm2
+; X64-SSE2-NEXT: pxor %xmm0, %xmm9
+; X64-SSE2-NEXT: movdqa %xmm9, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm2, %xmm9
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm4, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm1, %xmm3
+; X64-SSE2-NEXT: movq %xmm3, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i64:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm5
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; X64-SSE42-NEXT: pxor %xmm4, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm2, %xmm6
+; X64-SSE42-NEXT: pxor %xmm4, %xmm6
+; X64-SSE42-NEXT: pcmpgtq %xmm0, %xmm6
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm7
+; X64-SSE42-NEXT: pxor %xmm4, %xmm7
+; X64-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X64-SSE42-NEXT: pxor %xmm4, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm7, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; X64-SSE42-NEXT: movdqa %xmm6, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm2
+; X64-SSE42-NEXT: movapd %xmm2, %xmm1
+; X64-SSE42-NEXT: xorpd %xmm4, %xmm1
+; X64-SSE42-NEXT: movapd %xmm3, %xmm0
+; X64-SSE42-NEXT: xorpd %xmm4, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X64-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X64-SSE42-NEXT: pxor %xmm4, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm4
+; X64-SSE42-NEXT: pcmpgtq %xmm0, %xmm4
+; X64-SSE42-NEXT: movdqa %xmm4, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; X64-SSE42-NEXT: movq %xmm1, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i64:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm4
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm5
+; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm4
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2
+; X64-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i64:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4
+; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
+; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4
+; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
+; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2
+; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i64:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ult <8 x i64> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %1
+ %4 = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <8 x i64> %3, %4
+ %6 = select <8 x i1> %5, <8 x i64> %3, <8 x i64> %4
+ %7 = shufflevector <8 x i64> %6, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ult <8 x i64> %6, %7
+ %9 = select <8 x i1> %8, <8 x i64> %6, <8 x i64> %7
+ %10 = extractelement <8 x i64> %9, i32 0
+ ret i64 %10
+}
+
+define i32 @test_reduce_v16i32(<16 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i32:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm7
+; X86-SSE2-NEXT: pxor %xmm4, %xmm7
+; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm7
+; X86-SSE2-NEXT: pand %xmm7, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm7
+; X86-SSE2-NEXT: por %xmm1, %xmm7
+; X86-SSE2-NEXT: pand %xmm5, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm5
+; X86-SSE2-NEXT: por %xmm0, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X86-SSE2-NEXT: pxor %xmm4, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm7, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm5
+; X86-SSE2-NEXT: pandn %xmm7, %xmm1
+; X86-SSE2-NEXT: por %xmm5, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm4, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm3
+; X86-SSE2-NEXT: por %xmm1, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm3
+; X86-SSE2-NEXT: pandn %xmm0, %xmm4
+; X86-SSE2-NEXT: por %xmm3, %xmm4
+; X86-SSE2-NEXT: movd %xmm4, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i32:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pminud %xmm3, %xmm1
+; X86-SSE42-NEXT: pminud %xmm2, %xmm0
+; X86-SSE42-NEXT: pminud %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminud %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminud %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i32:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i32:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i32:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X64-SSE2-NEXT: pxor %xmm4, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm7
+; X64-SSE2-NEXT: pxor %xmm4, %xmm7
+; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm7
+; X64-SSE2-NEXT: pand %xmm7, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm7
+; X64-SSE2-NEXT: por %xmm1, %xmm7
+; X64-SSE2-NEXT: pand %xmm5, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm5
+; X64-SSE2-NEXT: por %xmm0, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X64-SSE2-NEXT: pxor %xmm4, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm7, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm5
+; X64-SSE2-NEXT: pandn %xmm7, %xmm1
+; X64-SSE2-NEXT: por %xmm5, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm4, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm4, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm1, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm3
+; X64-SSE2-NEXT: pandn %xmm0, %xmm4
+; X64-SSE2-NEXT: por %xmm3, %xmm4
+; X64-SSE2-NEXT: movd %xmm4, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i32:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pminud %xmm3, %xmm1
+; X64-SSE42-NEXT: pminud %xmm2, %xmm0
+; X64-SSE42-NEXT: pminud %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminud %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminud %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i32:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i32:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i32:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ult <16 x i32> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %1
+ %4 = shufflevector <16 x i32> %3, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <16 x i32> %3, %4
+ %6 = select <16 x i1> %5, <16 x i32> %3, <16 x i32> %4
+ %7 = shufflevector <16 x i32> %6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ult <16 x i32> %6, %7
+ %9 = select <16 x i1> %8, <16 x i32> %6, <16 x i32> %7
+ %10 = shufflevector <16 x i32> %9, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ult <16 x i32> %9, %10
+ %12 = select <16 x i1> %11, <16 x i32> %9, <16 x i32> %10
+ %13 = extractelement <16 x i32> %12, i32 0
+ ret i32 %13
+}
+
+define i16 @test_reduce_v32i16(<32 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v32i16:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: pcmpgtw %xmm6, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm7
+; X86-SSE2-NEXT: pxor %xmm4, %xmm7
+; X86-SSE2-NEXT: pcmpgtw %xmm6, %xmm7
+; X86-SSE2-NEXT: pand %xmm7, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm7
+; X86-SSE2-NEXT: por %xmm1, %xmm7
+; X86-SSE2-NEXT: pand %xmm5, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm5
+; X86-SSE2-NEXT: por %xmm0, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X86-SSE2-NEXT: pxor %xmm4, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm7, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: pcmpgtw %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm5
+; X86-SSE2-NEXT: pandn %xmm7, %xmm1
+; X86-SSE2-NEXT: por %xmm5, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm4, %xmm3
+; X86-SSE2-NEXT: pcmpgtw %xmm2, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm3
+; X86-SSE2-NEXT: por %xmm1, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: pcmpgtw %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm3
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm3, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm4
+; X86-SSE2-NEXT: pcmpgtw %xmm1, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm4
+; X86-SSE2-NEXT: por %xmm2, %xmm4
+; X86-SSE2-NEXT: movd %xmm4, %eax
+; X86-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v32i16:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pminuw %xmm3, %xmm1
+; X86-SSE42-NEXT: pminuw %xmm2, %xmm0
+; X86-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i16:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpminuw %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i16:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v32i16:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X64-SSE2-NEXT: pxor %xmm4, %xmm5
+; X64-SSE2-NEXT: pcmpgtw %xmm6, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm7
+; X64-SSE2-NEXT: pxor %xmm4, %xmm7
+; X64-SSE2-NEXT: pcmpgtw %xmm6, %xmm7
+; X64-SSE2-NEXT: pand %xmm7, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm7
+; X64-SSE2-NEXT: por %xmm1, %xmm7
+; X64-SSE2-NEXT: pand %xmm5, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm5
+; X64-SSE2-NEXT: por %xmm0, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X64-SSE2-NEXT: pxor %xmm4, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm7, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: pcmpgtw %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm5
+; X64-SSE2-NEXT: pandn %xmm7, %xmm1
+; X64-SSE2-NEXT: por %xmm5, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm4, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm4, %xmm3
+; X64-SSE2-NEXT: pcmpgtw %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm1, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pxor %xmm4, %xmm2
+; X64-SSE2-NEXT: pcmpgtw %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm3
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm3, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm4
+; X64-SSE2-NEXT: pcmpgtw %xmm1, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm4
+; X64-SSE2-NEXT: por %xmm2, %xmm4
+; X64-SSE2-NEXT: movd %xmm4, %eax
+; X64-SSE2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v32i16:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pminuw %xmm3, %xmm1
+; X64-SSE42-NEXT: pminuw %xmm2, %xmm0
+; X64-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pminuw %xmm0, %xmm1
+; X64-SSE42-NEXT: movd %xmm1, %eax
+; X64-SSE42-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i16:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpminuw %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i16:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i16:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ult <32 x i16> %a0, %1
+ %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1
+ %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <32 x i16> %3, %4
+ %6 = select <32 x i1> %5, <32 x i16> %3, <32 x i16> %4
+ %7 = shufflevector <32 x i16> %6, <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ult <32 x i16> %6, %7
+ %9 = select <32 x i1> %8, <32 x i16> %6, <32 x i16> %7
+ %10 = shufflevector <32 x i16> %9, <32 x i16> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ult <32 x i16> %9, %10
+ %12 = select <32 x i1> %11, <32 x i16> %9, <32 x i16> %10
+ %13 = shufflevector <32 x i16> %12, <32 x i16> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp ult <32 x i16> %12, %13
+ %15 = select <32 x i1> %14, <32 x i16> %12, <32 x i16> %13
+ %16 = extractelement <32 x i16> %15, i32 0
+ ret i16 %16
+}
+
+define i8 @test_reduce_v64i8(<64 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v64i8:
+; X86-SSE2: ## BB#0:
+; X86-SSE2-NEXT: pminub %xmm3, %xmm1
+; X86-SSE2-NEXT: pminub %xmm2, %xmm0
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pminub %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pminub %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: movd %xmm0, %eax
+; X86-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v64i8:
+; X86-SSE42: ## BB#0:
+; X86-SSE42-NEXT: pminub %xmm3, %xmm1
+; X86-SSE42-NEXT: pminub %xmm2, %xmm0
+; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminub %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pminub %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v64i8:
+; X86-AVX1: ## BB#0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpminub %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v64i8:
+; X86-AVX2: ## BB#0:
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v64i8:
+; X64-SSE2: ## BB#0:
+; X64-SSE2-NEXT: pminub %xmm3, %xmm1
+; X64-SSE2-NEXT: pminub %xmm2, %xmm0
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pminub %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pminub %xmm0, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: movd %xmm0, %eax
+; X64-SSE2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v64i8:
+; X64-SSE42: ## BB#0:
+; X64-SSE42-NEXT: pminub %xmm3, %xmm1
+; X64-SSE42-NEXT: pminub %xmm2, %xmm0
+; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminub %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pminub %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v64i8:
+; X64-AVX1: ## BB#0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpminub %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v64i8:
+; X64-AVX2: ## BB#0:
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v64i8:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ult <64 x i8> %a0, %1
+ %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1
+ %4 = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <64 x i8> %3, %4
+ %6 = select <64 x i1> %5, <64 x i8> %3, <64 x i8> %4
+ %7 = shufflevector <64 x i8> %6, <64 x i8> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ult <64 x i8> %6, %7
+ %9 = select <64 x i1> %8, <64 x i8> %6, <64 x i8> %7
+ %10 = shufflevector <64 x i8> %9, <64 x i8> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ult <64 x i8> %9, %10
+ %12 = select <64 x i1> %11, <64 x i8> %9, <64 x i8> %10
+ %13 = shufflevector <64 x i8> %12, <64 x i8> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp ult <64 x i8> %12, %13
+ %15 = select <64 x i1> %14, <64 x i8> %12, <64 x i8> %13
+ %16 = shufflevector <64 x i8> %15, <64 x i8> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %17 = icmp ult <64 x i8> %15, %16
+ %18 = select <64 x i1> %17, <64 x i8> %15, <64 x i8> %16
+ %19 = extractelement <64 x i8> %18, i32 0
+ ret i8 %19
+}
diff --git a/test/CodeGen/X86/illegal-bitfield-loadstore.ll b/test/CodeGen/X86/illegal-bitfield-loadstore.ll
index fd503aa6c6e..e3b25a539c1 100644
--- a/test/CodeGen/X86/illegal-bitfield-loadstore.ll
+++ b/test/CodeGen/X86/illegal-bitfield-loadstore.ll
@@ -81,6 +81,7 @@ define void @i24_insert_bit(i24* %a, i1 zeroext %bit) {
; X86-NEXT: orl %edx, %eax
; X86-NEXT: movw %ax, (%ecx)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: i24_insert_bit:
diff --git a/test/CodeGen/X86/imul.ll b/test/CodeGen/X86/imul.ll
index e364b001f94..02782f72108 100644
--- a/test/CodeGen/X86/imul.ll
+++ b/test/CodeGen/X86/imul.ll
@@ -307,6 +307,7 @@ define i64 @test5(i64 %a) {
; X86-NEXT: subl %ecx, %edx
; X86-NEXT: subl %esi, %edx
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
entry:
%tmp3 = mul i64 %a, -31
@@ -362,6 +363,7 @@ define i64 @test7(i64 %a) {
; X86-NEXT: subl %ecx, %edx
; X86-NEXT: subl %esi, %edx
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
entry:
%tmp3 = mul i64 %a, -33
@@ -390,6 +392,7 @@ define i64 @testOverflow(i64 %a) {
; X86-NEXT: addl %esi, %edx
; X86-NEXT: subl {{[0-9]+}}(%esp), %edx
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
entry:
%tmp3 = mul i64 %a, 9223372036854775807
diff --git a/test/CodeGen/X86/inline-asm-A-constraint.ll b/test/CodeGen/X86/inline-asm-A-constraint.ll
index 2ad011e88e0..7975b318eff 100644
--- a/test/CodeGen/X86/inline-asm-A-constraint.ll
+++ b/test/CodeGen/X86/inline-asm-A-constraint.ll
@@ -19,8 +19,7 @@ entry:
%.fca.1.insert = insertvalue { i64, i64 } %.fca.0.insert, i64 %retval.sroa.2.0.extract.trunc, 1
ret { i64, i64 } %.fca.1.insert
}
-; CHECK: lock
-; CHECK-NEXT: cmpxchg16b
+; CHECK: lock cmpxchg16b
attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind }
diff --git a/test/CodeGen/X86/lea-opt-cse1.ll b/test/CodeGen/X86/lea-opt-cse1.ll
index 05b47690e81..4c9ec3e0d7a 100644
--- a/test/CodeGen/X86/lea-opt-cse1.ll
+++ b/test/CodeGen/X86/lea-opt-cse1.ll
@@ -30,6 +30,7 @@ define void @test_func(%struct.SA* nocapture %ctx, i32 %n) local_unnamed_addr {
; X86-NEXT: leal 1(%edx,%ecx), %ecx
; X86-NEXT: movl %ecx, 16(%eax)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
entry:
%h0 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 0
diff --git a/test/CodeGen/X86/lea-opt-cse2.ll b/test/CodeGen/X86/lea-opt-cse2.ll
index 865dd49a6e1..cee6f6792cb 100644
--- a/test/CodeGen/X86/lea-opt-cse2.ll
+++ b/test/CodeGen/X86/lea-opt-cse2.ll
@@ -46,7 +46,9 @@ define void @foo(%struct.SA* nocapture %ctx, i32 %n) local_unnamed_addr #0 {
; X86-NEXT: leal 1(%esi,%edx), %ecx
; X86-NEXT: movl %ecx, 16(%eax)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: popl %edi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
entry:
br label %loop
diff --git a/test/CodeGen/X86/lea-opt-cse3.ll b/test/CodeGen/X86/lea-opt-cse3.ll
index 87949b40d48..ed3aff98036 100644
--- a/test/CodeGen/X86/lea-opt-cse3.ll
+++ b/test/CodeGen/X86/lea-opt-cse3.ll
@@ -91,6 +91,7 @@ define i32 @foo1_mult_basic_blocks(i32 %a, i32 %b) local_unnamed_addr #0 {
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: .LBB2_2: # %exit
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
entry:
%mul = shl i32 %b, 2
@@ -143,6 +144,7 @@ define i32 @foo1_mult_basic_blocks_illegal_scale(i32 %a, i32 %b) local_unnamed_a
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: .LBB3_2: # %exit
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
entry:
%mul = shl i32 %b, 1
diff --git a/test/CodeGen/X86/lea-opt-cse4.ll b/test/CodeGen/X86/lea-opt-cse4.ll
index 31f31a73d44..d068180c39c 100644
--- a/test/CodeGen/X86/lea-opt-cse4.ll
+++ b/test/CodeGen/X86/lea-opt-cse4.ll
@@ -36,6 +36,7 @@ define void @foo(%struct.SA* nocapture %ctx, i32 %n) local_unnamed_addr #0 {
; X86-NEXT: leal 1(%ecx,%edx), %ecx
; X86-NEXT: movl %ecx, 16(%eax)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
entry:
%h0 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 0
@@ -110,7 +111,9 @@ define void @foo_loop(%struct.SA* nocapture %ctx, i32 %n) local_unnamed_addr #0
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: movl %edx, 16(%eax)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: popl %edi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
entry:
br label %loop
diff --git a/test/CodeGen/X86/legalize-shift-64.ll b/test/CodeGen/X86/legalize-shift-64.ll
index ca4cfa5b805..7dff2c20d5a 100644
--- a/test/CodeGen/X86/legalize-shift-64.ll
+++ b/test/CodeGen/X86/legalize-shift-64.ll
@@ -117,9 +117,13 @@ define <2 x i64> @test5(<2 x i64> %A, <2 x i64> %B) {
; CHECK-NEXT: movl %esi, 4(%eax)
; CHECK-NEXT: movl %edi, (%eax)
; CHECK-NEXT: popl %esi
+; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: popl %edi
+; CHECK-NEXT: .cfi_def_cfa_offset 12
; CHECK-NEXT: popl %ebx
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: popl %ebp
+; CHECK-NEXT: .cfi_def_cfa_offset 4
; CHECK-NEXT: retl $4
%shl = shl <2 x i64> %A, %B
ret <2 x i64> %shl
@@ -160,6 +164,7 @@ define i32 @test6() {
; CHECK-NEXT: .LBB5_4: # %if.then
; CHECK-NEXT: movl %ebp, %esp
; CHECK-NEXT: popl %ebp
+; CHECK-NEXT: .cfi_def_cfa %esp, 4
; CHECK-NEXT: retl
%x = alloca i32, align 4
%t = alloca i64, align 8
diff --git a/test/CodeGen/X86/live-out-reg-info.ll b/test/CodeGen/X86/live-out-reg-info.ll
index b838065beea..170f73593f6 100644
--- a/test/CodeGen/X86/live-out-reg-info.ll
+++ b/test/CodeGen/X86/live-out-reg-info.ll
@@ -18,6 +18,7 @@ define void @foo(i32 %a) {
; CHECK-NEXT: callq qux
; CHECK-NEXT: .LBB0_2: # %false
; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%t0 = lshr i32 %a, 23
br label %next
diff --git a/test/CodeGen/X86/load-combine.ll b/test/CodeGen/X86/load-combine.ll
index d1f5f41ac7b..d46efc4b5ec 100644
--- a/test/CodeGen/X86/load-combine.ll
+++ b/test/CodeGen/X86/load-combine.ll
@@ -376,6 +376,7 @@ define i32 @load_i32_by_i8_bswap_uses(i32* %arg) {
; CHECK-NEXT: orl %ecx, %eax
; CHECK-NEXT: orl %edx, %eax
; CHECK-NEXT: popl %esi
+; CHECK-NEXT: .cfi_def_cfa_offset 4
; CHECK-NEXT: retl
;
; CHECK64-LABEL: load_i32_by_i8_bswap_uses:
@@ -496,6 +497,7 @@ define i32 @load_i32_by_i8_bswap_store_in_between(i32* %arg, i32* %arg1) {
; CHECK-NEXT: movzbl 3(%ecx), %eax
; CHECK-NEXT: orl %edx, %eax
; CHECK-NEXT: popl %esi
+; CHECK-NEXT: .cfi_def_cfa_offset 4
; CHECK-NEXT: retl
;
; CHECK64-LABEL: load_i32_by_i8_bswap_store_in_between:
diff --git a/test/CodeGen/X86/masked_gather_scatter.ll b/test/CodeGen/X86/masked_gather_scatter.ll
index 8983c3acb53..207175aae1a 100644
--- a/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/test/CodeGen/X86/masked_gather_scatter.ll
@@ -1057,9 +1057,7 @@ define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) {
; SKX: # BB#0:
; SKX-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
-; SKX-NEXT: vptestmq %xmm2, %xmm2, %k0
-; SKX-NEXT: kshiftlb $6, %k0, %k0
-; SKX-NEXT: kshiftrb $6, %k0, %k1
+; SKX-NEXT: vptestmq %xmm2, %xmm2, %k1
; SKX-NEXT: vscatterqps %xmm0, (,%ymm1) {%k1}
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -1068,9 +1066,7 @@ define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) {
; SKX_32: # BB#0:
; SKX_32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
-; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k0
-; SKX_32-NEXT: kshiftlb $6, %k0, %k0
-; SKX_32-NEXT: kshiftrb $6, %k0, %k1
+; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k1
; SKX_32-NEXT: vscatterdps %xmm0, (,%xmm1) {%k1}
; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask)
@@ -1105,9 +1101,7 @@ define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
; SKX: # BB#0:
; SKX-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
-; SKX-NEXT: vptestmq %xmm2, %xmm2, %k0
-; SKX-NEXT: kshiftlb $6, %k0, %k0
-; SKX-NEXT: kshiftrb $6, %k0, %k1
+; SKX-NEXT: vptestmq %xmm2, %xmm2, %k1
; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
; SKX-NEXT: vzeroupper
@@ -1117,9 +1111,7 @@ define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
; SKX_32: # BB#0:
; SKX_32-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
-; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k0
-; SKX_32-NEXT: kshiftlb $6, %k0, %k0
-; SKX_32-NEXT: kshiftrb $6, %k0, %k1
+; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k1
; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX_32-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
; SKX_32-NEXT: vzeroupper
@@ -1165,9 +1157,7 @@ define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x fl
; SKX: # BB#0:
; SKX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
-; SKX-NEXT: vptestmq %xmm1, %xmm1, %k0
-; SKX-NEXT: kshiftlb $6, %k0, %k0
-; SKX-NEXT: kshiftrb $6, %k0, %k1
+; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm2 {%k1}
; SKX-NEXT: vmovaps %xmm2, %xmm0
; SKX-NEXT: retq
@@ -1176,9 +1166,7 @@ define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x fl
; SKX_32: # BB#0:
; SKX_32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
-; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k0
-; SKX_32-NEXT: kshiftlb $6, %k0, %k0
-; SKX_32-NEXT: kshiftrb $6, %k0, %k1
+; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm2 {%k1}
; SKX_32-NEXT: vmovaps %xmm2, %xmm0
@@ -1702,6 +1690,7 @@ define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i
; KNL_32-NEXT: vmovdqa64 %zmm2, %zmm0
; KNL_32-NEXT: movl %ebp, %esp
; KNL_32-NEXT: popl %ebp
+; KNL_32-NEXT: .cfi_def_cfa %esp, 4
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_gather_16i64:
@@ -1736,6 +1725,7 @@ define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i
; SKX_32-NEXT: vmovdqa64 %zmm2, %zmm0
; SKX_32-NEXT: movl %ebp, %esp
; SKX_32-NEXT: popl %ebp
+; SKX_32-NEXT: .cfi_def_cfa %esp, 4
; SKX_32-NEXT: retl
%res = call <16 x i64> @llvm.masked.gather.v16i64.v16p0i64(<16 x i64*> %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
ret <16 x i64> %res
@@ -1819,6 +1809,7 @@ define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <
; KNL_32-NEXT: vmovapd %zmm2, %zmm0
; KNL_32-NEXT: movl %ebp, %esp
; KNL_32-NEXT: popl %ebp
+; KNL_32-NEXT: .cfi_def_cfa %esp, 4
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_gather_16f64:
@@ -1853,6 +1844,7 @@ define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <
; SKX_32-NEXT: vmovapd %zmm2, %zmm0
; SKX_32-NEXT: movl %ebp, %esp
; SKX_32-NEXT: popl %ebp
+; SKX_32-NEXT: .cfi_def_cfa %esp, 4
; SKX_32-NEXT: retl
%res = call <16 x double> @llvm.masked.gather.v16f64.v16p0f64(<16 x double*> %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
ret <16 x double> %res
@@ -1934,6 +1926,7 @@ define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %
; KNL_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2}
; KNL_32-NEXT: movl %ebp, %esp
; KNL_32-NEXT: popl %ebp
+; KNL_32-NEXT: .cfi_def_cfa %esp, 4
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
@@ -1967,6 +1960,7 @@ define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %
; SKX_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2}
; SKX_32-NEXT: movl %ebp, %esp
; SKX_32-NEXT: popl %ebp
+; SKX_32-NEXT: .cfi_def_cfa %esp, 4
; SKX_32-NEXT: vzeroupper
; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v16i64.v16p0i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32 4, <16 x i1> %mask)
@@ -2050,6 +2044,7 @@ define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x dou
; KNL_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2}
; KNL_32-NEXT: movl %ebp, %esp
; KNL_32-NEXT: popl %ebp
+; KNL_32-NEXT: .cfi_def_cfa %esp, 4
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
@@ -2083,6 +2078,7 @@ define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x dou
; SKX_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2}
; SKX_32-NEXT: movl %ebp, %esp
; SKX_32-NEXT: popl %ebp
+; SKX_32-NEXT: .cfi_def_cfa %esp, 4
; SKX_32-NEXT: vzeroupper
; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v16f64.v16p0f64(<16 x double> %src0, <16 x double*> %ptrs, i32 4, <16 x i1> %mask)
@@ -2127,6 +2123,7 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6
; KNL_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; KNL_32-NEXT: movl %ebp, %esp
; KNL_32-NEXT: popl %ebp
+; KNL_32-NEXT: .cfi_def_cfa %esp, 4
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_pr28312:
@@ -2154,6 +2151,7 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6
; SKX_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; SKX_32-NEXT: movl %ebp, %esp
; SKX_32-NEXT: popl %ebp
+; SKX_32-NEXT: .cfi_def_cfa %esp, 4
; SKX_32-NEXT: retl
%g1 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
%g2 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
diff --git a/test/CodeGen/X86/masked_memop.ll b/test/CodeGen/X86/masked_memop.ll
index 3e257f5fd85..f43e3f6f56e 100644
--- a/test/CodeGen/X86/masked_memop.ll
+++ b/test/CodeGen/X86/masked_memop.ll
@@ -285,9 +285,7 @@ define <8 x i32> @test11b(<8 x i1> %mask, <8 x i32>* %addr, <8 x i32> %dst) {
; AVX512F-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
-; AVX512F-NEXT: kshiftlw $8, %k0, %k0
-; AVX512F-NEXT: kshiftrw $8, %k0, %k1
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vpblendmd (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512F-NEXT: retq
@@ -327,9 +325,7 @@ define <8 x float> @test11c(<8 x i1> %mask, <8 x float>* %addr) {
; AVX512F: ## BB#0:
; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
-; AVX512F-NEXT: kshiftlw $8, %k0, %k0
-; AVX512F-NEXT: kshiftrw $8, %k0, %k1
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512F-NEXT: retq
@@ -369,9 +365,7 @@ define <8 x i32> @test11d(<8 x i1> %mask, <8 x i32>* %addr) {
; AVX512F: ## BB#0:
; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
-; AVX512F-NEXT: kshiftlw $8, %k0, %k0
-; AVX512F-NEXT: kshiftrw $8, %k0, %k1
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; AVX512F-NEXT: retq
diff --git a/test/CodeGen/X86/memcmp-optsize.ll b/test/CodeGen/X86/memcmp-optsize.ll
index 77d9fa69182..3f5eeba7055 100644
--- a/test/CodeGen/X86/memcmp-optsize.ll
+++ b/test/CodeGen/X86/memcmp-optsize.ll
@@ -156,36 +156,36 @@ define i32 @length3(i8* %X, i8* %Y) nounwind optsize {
define i1 @length3_eq(i8* %X, i8* %Y) nounwind optsize {
; X86-LABEL: length3_eq:
-; X86: # BB#0: # %loadbb
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86: # BB#0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movzwl (%eax), %edx
-; X86-NEXT: cmpw (%ecx), %dx
-; X86-NEXT: jne .LBB5_1
-; X86-NEXT: # BB#2: # %loadbb1
-; X86-NEXT: movb 2(%eax), %dl
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: cmpb 2(%ecx), %dl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzwl (%ecx), %edx
+; X86-NEXT: cmpw (%eax), %dx
+; X86-NEXT: jne .LBB5_2
+; X86-NEXT: # BB#1: # %loadbb1
+; X86-NEXT: movb 2(%ecx), %dl
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: cmpb 2(%eax), %dl
; X86-NEXT: je .LBB5_3
-; X86-NEXT: .LBB5_1: # %res_block
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: incl %eax
+; X86-NEXT: .LBB5_2: # %res_block
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: incl %ecx
; X86-NEXT: .LBB5_3: # %endblock
-; X86-NEXT: testl %eax, %eax
+; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: setne %al
; X86-NEXT: retl
;
; X64-LABEL: length3_eq:
-; X64: # BB#0: # %loadbb
+; X64: # BB#0:
; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: cmpw (%rsi), %ax
-; X64-NEXT: jne .LBB5_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB5_2
+; X64-NEXT: # BB#1: # %loadbb1
; X64-NEXT: movb 2(%rdi), %cl
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpb 2(%rsi), %cl
; X64-NEXT: je .LBB5_3
-; X64-NEXT: .LBB5_1: # %res_block
+; X64-NEXT: .LBB5_2: # %res_block
; X64-NEXT: movl $1, %eax
; X64-NEXT: .LBB5_3: # %endblock
; X64-NEXT: testl %eax, %eax
@@ -314,36 +314,36 @@ define i32 @length5(i8* %X, i8* %Y) nounwind optsize {
define i1 @length5_eq(i8* %X, i8* %Y) nounwind optsize {
; X86-LABEL: length5_eq:
-; X86: # BB#0: # %loadbb
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86: # BB#0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl (%eax), %edx
-; X86-NEXT: cmpl (%ecx), %edx
-; X86-NEXT: jne .LBB10_1
-; X86-NEXT: # BB#2: # %loadbb1
-; X86-NEXT: movb 4(%eax), %dl
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: cmpb 4(%ecx), %dl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl (%ecx), %edx
+; X86-NEXT: cmpl (%eax), %edx
+; X86-NEXT: jne .LBB10_2
+; X86-NEXT: # BB#1: # %loadbb1
+; X86-NEXT: movb 4(%ecx), %dl
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: cmpb 4(%eax), %dl
; X86-NEXT: je .LBB10_3
-; X86-NEXT: .LBB10_1: # %res_block
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: incl %eax
+; X86-NEXT: .LBB10_2: # %res_block
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: incl %ecx
; X86-NEXT: .LBB10_3: # %endblock
-; X86-NEXT: testl %eax, %eax
+; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: setne %al
; X86-NEXT: retl
;
; X64-LABEL: length5_eq:
-; X64: # BB#0: # %loadbb
+; X64: # BB#0:
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: cmpl (%rsi), %eax
-; X64-NEXT: jne .LBB10_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB10_2
+; X64-NEXT: # BB#1: # %loadbb1
; X64-NEXT: movb 4(%rdi), %cl
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpb 4(%rsi), %cl
; X64-NEXT: je .LBB10_3
-; X64-NEXT: .LBB10_1: # %res_block
+; X64-NEXT: .LBB10_2: # %res_block
; X64-NEXT: movl $1, %eax
; X64-NEXT: .LBB10_3: # %endblock
; X64-NEXT: testl %eax, %eax
@@ -356,7 +356,7 @@ define i1 @length5_eq(i8* %X, i8* %Y) nounwind optsize {
define i32 @length8(i8* %X, i8* %Y) nounwind optsize {
; X86-LABEL: length8:
-; X86: # BB#0: # %loadbb
+; X86: # BB#0:
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
@@ -365,8 +365,8 @@ define i32 @length8(i8* %X, i8* %Y) nounwind optsize {
; X86-NEXT: bswapl %ecx
; X86-NEXT: bswapl %edx
; X86-NEXT: cmpl %edx, %ecx
-; X86-NEXT: jne .LBB11_1
-; X86-NEXT: # BB#2: # %loadbb1
+; X86-NEXT: jne .LBB11_2
+; X86-NEXT: # BB#1: # %loadbb1
; X86-NEXT: movl 4(%esi), %ecx
; X86-NEXT: movl 4(%eax), %edx
; X86-NEXT: bswapl %ecx
@@ -374,7 +374,7 @@ define i32 @length8(i8* %X, i8* %Y) nounwind optsize {
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl %edx, %ecx
; X86-NEXT: je .LBB11_3
-; X86-NEXT: .LBB11_1: # %res_block
+; X86-NEXT: .LBB11_2: # %res_block
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl %edx, %ecx
; X86-NEXT: setae %al
@@ -400,22 +400,22 @@ define i32 @length8(i8* %X, i8* %Y) nounwind optsize {
define i1 @length8_eq(i8* %X, i8* %Y) nounwind optsize {
; X86-LABEL: length8_eq:
-; X86: # BB#0: # %loadbb
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86: # BB#0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl (%eax), %edx
-; X86-NEXT: cmpl (%ecx), %edx
-; X86-NEXT: jne .LBB12_1
-; X86-NEXT: # BB#2: # %loadbb1
-; X86-NEXT: movl 4(%eax), %edx
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: cmpl 4(%ecx), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl (%ecx), %edx
+; X86-NEXT: cmpl (%eax), %edx
+; X86-NEXT: jne .LBB12_2
+; X86-NEXT: # BB#1: # %loadbb1
+; X86-NEXT: movl 4(%ecx), %edx
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: cmpl 4(%eax), %edx
; X86-NEXT: je .LBB12_3
-; X86-NEXT: .LBB12_1: # %res_block
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: incl %eax
+; X86-NEXT: .LBB12_2: # %res_block
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: incl %ecx
; X86-NEXT: .LBB12_3: # %endblock
-; X86-NEXT: testl %eax, %eax
+; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: sete %al
; X86-NEXT: retl
;
@@ -432,15 +432,15 @@ define i1 @length8_eq(i8* %X, i8* %Y) nounwind optsize {
define i1 @length8_eq_const(i8* %X) nounwind optsize {
; X86-LABEL: length8_eq_const:
-; X86: # BB#0: # %loadbb
+; X86: # BB#0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: cmpl $858927408, (%ecx) # imm = 0x33323130
-; X86-NEXT: jne .LBB13_1
-; X86-NEXT: # BB#2: # %loadbb1
+; X86-NEXT: jne .LBB13_2
+; X86-NEXT: # BB#1: # %loadbb1
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl $926299444, 4(%ecx) # imm = 0x37363534
; X86-NEXT: je .LBB13_3
-; X86-NEXT: .LBB13_1: # %res_block
+; X86-NEXT: .LBB13_2: # %res_block
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: incl %eax
; X86-NEXT: .LBB13_3: # %endblock
@@ -473,16 +473,16 @@ define i1 @length12_eq(i8* %X, i8* %Y) nounwind optsize {
; X86-NEXT: retl
;
; X64-LABEL: length12_eq:
-; X64: # BB#0: # %loadbb
+; X64: # BB#0:
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: cmpq (%rsi), %rax
-; X64-NEXT: jne .LBB14_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB14_2
+; X64-NEXT: # BB#1: # %loadbb1
; X64-NEXT: movl 8(%rdi), %ecx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpl 8(%rsi), %ecx
; X64-NEXT: je .LBB14_3
-; X64-NEXT: .LBB14_1: # %res_block
+; X64-NEXT: .LBB14_2: # %res_block
; X64-NEXT: movl $1, %eax
; X64-NEXT: .LBB14_3: # %endblock
; X64-NEXT: testl %eax, %eax
@@ -505,28 +505,27 @@ define i32 @length12(i8* %X, i8* %Y) nounwind optsize {
; X86-NEXT: retl
;
; X64-LABEL: length12:
-; X64: # BB#0: # %loadbb
+; X64: # BB#0:
; X64-NEXT: movq (%rdi), %rcx
; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rcx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: jne .LBB15_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB15_2
+; X64-NEXT: # BB#1: # %loadbb1
; X64-NEXT: movl 8(%rdi), %ecx
; X64-NEXT: movl 8(%rsi), %edx
; X64-NEXT: bswapl %ecx
; X64-NEXT: bswapl %edx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: jne .LBB15_1
-; X64-NEXT: # BB#3: # %endblock
-; X64-NEXT: retq
-; X64-NEXT: .LBB15_1: # %res_block
+; X64-NEXT: je .LBB15_3
+; X64-NEXT: .LBB15_2: # %res_block
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: setae %al
; X64-NEXT: leal -1(%rax,%rax), %eax
+; X64-NEXT: .LBB15_3: # %endblock
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind
ret i32 %m
@@ -546,28 +545,27 @@ define i32 @length16(i8* %X, i8* %Y) nounwind optsize {
; X86-NEXT: retl
;
; X64-LABEL: length16:
-; X64: # BB#0: # %loadbb
+; X64: # BB#0:
; X64-NEXT: movq (%rdi), %rcx
; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rcx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: jne .LBB16_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB16_2
+; X64-NEXT: # BB#1: # %loadbb1
; X64-NEXT: movq 8(%rdi), %rcx
; X64-NEXT: movq 8(%rsi), %rdx
; X64-NEXT: bswapq %rcx
; X64-NEXT: bswapq %rdx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: jne .LBB16_1
-; X64-NEXT: # BB#3: # %endblock
-; X64-NEXT: retq
-; X64-NEXT: .LBB16_1: # %res_block
+; X64-NEXT: je .LBB16_3
+; X64-NEXT: .LBB16_2: # %res_block
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: setae %al
; X64-NEXT: leal -1(%rax,%rax), %eax
+; X64-NEXT: .LBB16_3: # %endblock
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind
ret i32 %m
@@ -701,19 +699,19 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind optsize {
; X86-NEXT: retl
;
; X64-SSE2-LABEL: length24_eq:
-; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2: # BB#0:
; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT: jne .LBB20_1
-; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: jne .LBB20_2
+; X64-SSE2-NEXT: # BB#1: # %loadbb1
; X64-SSE2-NEXT: movq 16(%rdi), %rcx
; X64-SSE2-NEXT: xorl %eax, %eax
; X64-SSE2-NEXT: cmpq 16(%rsi), %rcx
; X64-SSE2-NEXT: je .LBB20_3
-; X64-SSE2-NEXT: .LBB20_1: # %res_block
+; X64-SSE2-NEXT: .LBB20_2: # %res_block
; X64-SSE2-NEXT: movl $1, %eax
; X64-SSE2-NEXT: .LBB20_3: # %endblock
; X64-SSE2-NEXT: testl %eax, %eax
@@ -721,18 +719,18 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind optsize {
; X64-SSE2-NEXT: retq
;
; X64-AVX2-LABEL: length24_eq:
-; X64-AVX2: # BB#0: # %loadbb
+; X64-AVX2: # BB#0:
; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-AVX2-NEXT: jne .LBB20_1
-; X64-AVX2-NEXT: # BB#2: # %loadbb1
+; X64-AVX2-NEXT: jne .LBB20_2
+; X64-AVX2-NEXT: # BB#1: # %loadbb1
; X64-AVX2-NEXT: movq 16(%rdi), %rcx
; X64-AVX2-NEXT: xorl %eax, %eax
; X64-AVX2-NEXT: cmpq 16(%rsi), %rcx
; X64-AVX2-NEXT: je .LBB20_3
-; X64-AVX2-NEXT: .LBB20_1: # %res_block
+; X64-AVX2-NEXT: .LBB20_2: # %res_block
; X64-AVX2-NEXT: movl $1, %eax
; X64-AVX2-NEXT: .LBB20_3: # %endblock
; X64-AVX2-NEXT: testl %eax, %eax
@@ -757,18 +755,18 @@ define i1 @length24_eq_const(i8* %X) nounwind optsize {
; X86-NEXT: retl
;
; X64-SSE2-LABEL: length24_eq_const:
-; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2: # BB#0:
; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT: jne .LBB21_1
-; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: jne .LBB21_2
+; X64-SSE2-NEXT: # BB#1: # %loadbb1
; X64-SSE2-NEXT: xorl %eax, %eax
; X64-SSE2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736
; X64-SSE2-NEXT: cmpq %rcx, 16(%rdi)
; X64-SSE2-NEXT: je .LBB21_3
-; X64-SSE2-NEXT: .LBB21_1: # %res_block
+; X64-SSE2-NEXT: .LBB21_2: # %res_block
; X64-SSE2-NEXT: movl $1, %eax
; X64-SSE2-NEXT: .LBB21_3: # %endblock
; X64-SSE2-NEXT: testl %eax, %eax
@@ -776,18 +774,18 @@ define i1 @length24_eq_const(i8* %X) nounwind optsize {
; X64-SSE2-NEXT: retq
;
; X64-AVX2-LABEL: length24_eq_const:
-; X64-AVX2: # BB#0: # %loadbb
+; X64-AVX2: # BB#0:
; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-AVX2-NEXT: jne .LBB21_1
-; X64-AVX2-NEXT: # BB#2: # %loadbb1
+; X64-AVX2-NEXT: jne .LBB21_2
+; X64-AVX2-NEXT: # BB#1: # %loadbb1
; X64-AVX2-NEXT: xorl %eax, %eax
; X64-AVX2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736
; X64-AVX2-NEXT: cmpq %rcx, 16(%rdi)
; X64-AVX2-NEXT: je .LBB21_3
-; X64-AVX2-NEXT: .LBB21_1: # %res_block
+; X64-AVX2-NEXT: .LBB21_2: # %res_block
; X64-AVX2-NEXT: movl $1, %eax
; X64-AVX2-NEXT: .LBB21_3: # %endblock
; X64-AVX2-NEXT: testl %eax, %eax
@@ -833,7 +831,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize {
; X86-NOSSE-NEXT: retl
;
; X86-SSE2-LABEL: length32_eq:
-; X86-SSE2: # BB#0: # %loadbb
+; X86-SSE2: # BB#0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
@@ -841,8 +839,8 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize {
; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
; X86-SSE2-NEXT: pmovmskb %xmm1, %edx
; X86-SSE2-NEXT: cmpl $65535, %edx # imm = 0xFFFF
-; X86-SSE2-NEXT: jne .LBB23_1
-; X86-SSE2-NEXT: # BB#2: # %loadbb1
+; X86-SSE2-NEXT: jne .LBB23_2
+; X86-SSE2-NEXT: # BB#1: # %loadbb1
; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0
; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1
; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
@@ -850,7 +848,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize {
; X86-SSE2-NEXT: xorl %eax, %eax
; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; X86-SSE2-NEXT: je .LBB23_3
-; X86-SSE2-NEXT: .LBB23_1: # %res_block
+; X86-SSE2-NEXT: .LBB23_2: # %res_block
; X86-SSE2-NEXT: xorl %eax, %eax
; X86-SSE2-NEXT: incl %eax
; X86-SSE2-NEXT: .LBB23_3: # %endblock
@@ -859,14 +857,14 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize {
; X86-SSE2-NEXT: retl
;
; X64-SSE2-LABEL: length32_eq:
-; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2: # BB#0:
; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT: jne .LBB23_1
-; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: jne .LBB23_2
+; X64-SSE2-NEXT: # BB#1: # %loadbb1
; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0
; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm1
; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
@@ -874,7 +872,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize {
; X64-SSE2-NEXT: xorl %eax, %eax
; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; X64-SSE2-NEXT: je .LBB23_3
-; X64-SSE2-NEXT: .LBB23_1: # %res_block
+; X64-SSE2-NEXT: .LBB23_2: # %res_block
; X64-SSE2-NEXT: movl $1, %eax
; X64-SSE2-NEXT: .LBB23_3: # %endblock
; X64-SSE2-NEXT: testl %eax, %eax
@@ -909,21 +907,21 @@ define i1 @length32_eq_const(i8* %X) nounwind optsize {
; X86-NOSSE-NEXT: retl
;
; X86-SSE2-LABEL: length32_eq_const:
-; X86-SSE2: # BB#0: # %loadbb
+; X86-SSE2: # BB#0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movdqu (%eax), %xmm0
; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx
; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
-; X86-SSE2-NEXT: jne .LBB24_1
-; X86-SSE2-NEXT: # BB#2: # %loadbb1
+; X86-SSE2-NEXT: jne .LBB24_2
+; X86-SSE2-NEXT: # BB#1: # %loadbb1
; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0
; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx
; X86-SSE2-NEXT: xorl %eax, %eax
; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; X86-SSE2-NEXT: je .LBB24_3
-; X86-SSE2-NEXT: .LBB24_1: # %res_block
+; X86-SSE2-NEXT: .LBB24_2: # %res_block
; X86-SSE2-NEXT: xorl %eax, %eax
; X86-SSE2-NEXT: incl %eax
; X86-SSE2-NEXT: .LBB24_3: # %endblock
@@ -932,20 +930,20 @@ define i1 @length32_eq_const(i8* %X) nounwind optsize {
; X86-SSE2-NEXT: retl
;
; X64-SSE2-LABEL: length32_eq_const:
-; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2: # BB#0:
; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT: jne .LBB24_1
-; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: jne .LBB24_2
+; X64-SSE2-NEXT: # BB#1: # %loadbb1
; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0
; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
; X64-SSE2-NEXT: pmovmskb %xmm0, %ecx
; X64-SSE2-NEXT: xorl %eax, %eax
; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; X64-SSE2-NEXT: je .LBB24_3
-; X64-SSE2-NEXT: .LBB24_1: # %res_block
+; X64-SSE2-NEXT: .LBB24_2: # %res_block
; X64-SSE2-NEXT: movl $1, %eax
; X64-SSE2-NEXT: .LBB24_3: # %endblock
; X64-SSE2-NEXT: testl %eax, %eax
@@ -1009,20 +1007,20 @@ define i1 @length64_eq(i8* %x, i8* %y) nounwind optsize {
; X64-SSE2-NEXT: retq
;
; X64-AVX2-LABEL: length64_eq:
-; X64-AVX2: # BB#0: # %loadbb
+; X64-AVX2: # BB#0:
; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
; X64-AVX2-NEXT: cmpl $-1, %eax
-; X64-AVX2-NEXT: jne .LBB26_1
-; X64-AVX2-NEXT: # BB#2: # %loadbb1
+; X64-AVX2-NEXT: jne .LBB26_2
+; X64-AVX2-NEXT: # BB#1: # %loadbb1
; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm0, %ymm0
; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx
; X64-AVX2-NEXT: xorl %eax, %eax
; X64-AVX2-NEXT: cmpl $-1, %ecx
; X64-AVX2-NEXT: je .LBB26_3
-; X64-AVX2-NEXT: .LBB26_1: # %res_block
+; X64-AVX2-NEXT: .LBB26_2: # %res_block
; X64-AVX2-NEXT: movl $1, %eax
; X64-AVX2-NEXT: .LBB26_3: # %endblock
; X64-AVX2-NEXT: testl %eax, %eax
@@ -1059,20 +1057,20 @@ define i1 @length64_eq_const(i8* %X) nounwind optsize {
; X64-SSE2-NEXT: retq
;
; X64-AVX2-LABEL: length64_eq_const:
-; X64-AVX2: # BB#0: # %loadbb
+; X64-AVX2: # BB#0:
; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
; X64-AVX2-NEXT: cmpl $-1, %eax
-; X64-AVX2-NEXT: jne .LBB27_1
-; X64-AVX2-NEXT: # BB#2: # %loadbb1
+; X64-AVX2-NEXT: jne .LBB27_2
+; X64-AVX2-NEXT: # BB#1: # %loadbb1
; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx
; X64-AVX2-NEXT: xorl %eax, %eax
; X64-AVX2-NEXT: cmpl $-1, %ecx
; X64-AVX2-NEXT: je .LBB27_3
-; X64-AVX2-NEXT: .LBB27_1: # %res_block
+; X64-AVX2-NEXT: .LBB27_2: # %res_block
; X64-AVX2-NEXT: movl $1, %eax
; X64-AVX2-NEXT: .LBB27_3: # %endblock
; X64-AVX2-NEXT: testl %eax, %eax
diff --git a/test/CodeGen/X86/memcmp.ll b/test/CodeGen/X86/memcmp.ll
index 393e4c42d8b..84fd45b0a08 100644
--- a/test/CodeGen/X86/memcmp.ll
+++ b/test/CodeGen/X86/memcmp.ll
@@ -187,35 +187,35 @@ define i32 @length3(i8* %X, i8* %Y) nounwind {
define i1 @length3_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length3_eq:
-; X86: # BB#0: # %loadbb
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86: # BB#0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movzwl (%eax), %edx
-; X86-NEXT: cmpw (%ecx), %dx
-; X86-NEXT: jne .LBB7_1
-; X86-NEXT: # BB#2: # %loadbb1
-; X86-NEXT: movb 2(%eax), %dl
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: cmpb 2(%ecx), %dl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzwl (%ecx), %edx
+; X86-NEXT: cmpw (%eax), %dx
+; X86-NEXT: jne .LBB7_2
+; X86-NEXT: # BB#1: # %loadbb1
+; X86-NEXT: movb 2(%ecx), %dl
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: cmpb 2(%eax), %dl
; X86-NEXT: je .LBB7_3
-; X86-NEXT: .LBB7_1: # %res_block
-; X86-NEXT: movl $1, %eax
+; X86-NEXT: .LBB7_2: # %res_block
+; X86-NEXT: movl $1, %ecx
; X86-NEXT: .LBB7_3: # %endblock
-; X86-NEXT: testl %eax, %eax
+; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: setne %al
; X86-NEXT: retl
;
; X64-LABEL: length3_eq:
-; X64: # BB#0: # %loadbb
+; X64: # BB#0:
; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: cmpw (%rsi), %ax
-; X64-NEXT: jne .LBB7_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB7_2
+; X64-NEXT: # BB#1: # %loadbb1
; X64-NEXT: movb 2(%rdi), %cl
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpb 2(%rsi), %cl
; X64-NEXT: je .LBB7_3
-; X64-NEXT: .LBB7_1: # %res_block
+; X64-NEXT: .LBB7_2: # %res_block
; X64-NEXT: movl $1, %eax
; X64-NEXT: .LBB7_3: # %endblock
; X64-NEXT: testl %eax, %eax
@@ -344,35 +344,35 @@ define i32 @length5(i8* %X, i8* %Y) nounwind {
define i1 @length5_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length5_eq:
-; X86: # BB#0: # %loadbb
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86: # BB#0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl (%eax), %edx
-; X86-NEXT: cmpl (%ecx), %edx
-; X86-NEXT: jne .LBB12_1
-; X86-NEXT: # BB#2: # %loadbb1
-; X86-NEXT: movb 4(%eax), %dl
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: cmpb 4(%ecx), %dl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl (%ecx), %edx
+; X86-NEXT: cmpl (%eax), %edx
+; X86-NEXT: jne .LBB12_2
+; X86-NEXT: # BB#1: # %loadbb1
+; X86-NEXT: movb 4(%ecx), %dl
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: cmpb 4(%eax), %dl
; X86-NEXT: je .LBB12_3
-; X86-NEXT: .LBB12_1: # %res_block
-; X86-NEXT: movl $1, %eax
+; X86-NEXT: .LBB12_2: # %res_block
+; X86-NEXT: movl $1, %ecx
; X86-NEXT: .LBB12_3: # %endblock
-; X86-NEXT: testl %eax, %eax
+; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: setne %al
; X86-NEXT: retl
;
; X64-LABEL: length5_eq:
-; X64: # BB#0: # %loadbb
+; X64: # BB#0:
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: cmpl (%rsi), %eax
-; X64-NEXT: jne .LBB12_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB12_2
+; X64-NEXT: # BB#1: # %loadbb1
; X64-NEXT: movb 4(%rdi), %cl
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpb 4(%rsi), %cl
; X64-NEXT: je .LBB12_3
-; X64-NEXT: .LBB12_1: # %res_block
+; X64-NEXT: .LBB12_2: # %res_block
; X64-NEXT: movl $1, %eax
; X64-NEXT: .LBB12_3: # %endblock
; X64-NEXT: testl %eax, %eax
@@ -385,7 +385,7 @@ define i1 @length5_eq(i8* %X, i8* %Y) nounwind {
define i32 @length8(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length8:
-; X86: # BB#0: # %loadbb
+; X86: # BB#0:
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
@@ -394,23 +394,21 @@ define i32 @length8(i8* %X, i8* %Y) nounwind {
; X86-NEXT: bswapl %ecx
; X86-NEXT: bswapl %edx
; X86-NEXT: cmpl %edx, %ecx
-; X86-NEXT: jne .LBB13_1
-; X86-NEXT: # BB#2: # %loadbb1
+; X86-NEXT: jne .LBB13_2
+; X86-NEXT: # BB#1: # %loadbb1
; X86-NEXT: movl 4(%esi), %ecx
; X86-NEXT: movl 4(%eax), %edx
; X86-NEXT: bswapl %ecx
; X86-NEXT: bswapl %edx
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl %edx, %ecx
-; X86-NEXT: jne .LBB13_1
-; X86-NEXT: # BB#3: # %endblock
-; X86-NEXT: popl %esi
-; X86-NEXT: retl
-; X86-NEXT: .LBB13_1: # %res_block
+; X86-NEXT: je .LBB13_3
+; X86-NEXT: .LBB13_2: # %res_block
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl %edx, %ecx
; X86-NEXT: setae %al
; X86-NEXT: leal -1(%eax,%eax), %eax
+; X86-NEXT: .LBB13_3: # %endblock
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
@@ -431,21 +429,21 @@ define i32 @length8(i8* %X, i8* %Y) nounwind {
define i1 @length8_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length8_eq:
-; X86: # BB#0: # %loadbb
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86: # BB#0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl (%eax), %edx
-; X86-NEXT: cmpl (%ecx), %edx
-; X86-NEXT: jne .LBB14_1
-; X86-NEXT: # BB#2: # %loadbb1
-; X86-NEXT: movl 4(%eax), %edx
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: cmpl 4(%ecx), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl (%ecx), %edx
+; X86-NEXT: cmpl (%eax), %edx
+; X86-NEXT: jne .LBB14_2
+; X86-NEXT: # BB#1: # %loadbb1
+; X86-NEXT: movl 4(%ecx), %edx
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: cmpl 4(%eax), %edx
; X86-NEXT: je .LBB14_3
-; X86-NEXT: .LBB14_1: # %res_block
-; X86-NEXT: movl $1, %eax
+; X86-NEXT: .LBB14_2: # %res_block
+; X86-NEXT: movl $1, %ecx
; X86-NEXT: .LBB14_3: # %endblock
-; X86-NEXT: testl %eax, %eax
+; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: sete %al
; X86-NEXT: retl
;
@@ -462,15 +460,15 @@ define i1 @length8_eq(i8* %X, i8* %Y) nounwind {
define i1 @length8_eq_const(i8* %X) nounwind {
; X86-LABEL: length8_eq_const:
-; X86: # BB#0: # %loadbb
+; X86: # BB#0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: cmpl $858927408, (%ecx) # imm = 0x33323130
-; X86-NEXT: jne .LBB15_1
-; X86-NEXT: # BB#2: # %loadbb1
+; X86-NEXT: jne .LBB15_2
+; X86-NEXT: # BB#1: # %loadbb1
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl $926299444, 4(%ecx) # imm = 0x37363534
; X86-NEXT: je .LBB15_3
-; X86-NEXT: .LBB15_1: # %res_block
+; X86-NEXT: .LBB15_2: # %res_block
; X86-NEXT: movl $1, %eax
; X86-NEXT: .LBB15_3: # %endblock
; X86-NEXT: testl %eax, %eax
@@ -502,16 +500,16 @@ define i1 @length12_eq(i8* %X, i8* %Y) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: length12_eq:
-; X64: # BB#0: # %loadbb
+; X64: # BB#0:
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: cmpq (%rsi), %rax
-; X64-NEXT: jne .LBB16_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB16_2
+; X64-NEXT: # BB#1: # %loadbb1
; X64-NEXT: movl 8(%rdi), %ecx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpl 8(%rsi), %ecx
; X64-NEXT: je .LBB16_3
-; X64-NEXT: .LBB16_1: # %res_block
+; X64-NEXT: .LBB16_2: # %res_block
; X64-NEXT: movl $1, %eax
; X64-NEXT: .LBB16_3: # %endblock
; X64-NEXT: testl %eax, %eax
@@ -534,28 +532,27 @@ define i32 @length12(i8* %X, i8* %Y) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: length12:
-; X64: # BB#0: # %loadbb
+; X64: # BB#0:
; X64-NEXT: movq (%rdi), %rcx
; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rcx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: jne .LBB17_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB17_2
+; X64-NEXT: # BB#1: # %loadbb1
; X64-NEXT: movl 8(%rdi), %ecx
; X64-NEXT: movl 8(%rsi), %edx
; X64-NEXT: bswapl %ecx
; X64-NEXT: bswapl %edx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: jne .LBB17_1
-; X64-NEXT: # BB#3: # %endblock
-; X64-NEXT: retq
-; X64-NEXT: .LBB17_1: # %res_block
+; X64-NEXT: je .LBB17_3
+; X64-NEXT: .LBB17_2: # %res_block
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: setae %al
; X64-NEXT: leal -1(%rax,%rax), %eax
+; X64-NEXT: .LBB17_3: # %endblock
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind
ret i32 %m
@@ -575,28 +572,27 @@ define i32 @length16(i8* %X, i8* %Y) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: length16:
-; X64: # BB#0: # %loadbb
+; X64: # BB#0:
; X64-NEXT: movq (%rdi), %rcx
; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rcx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: jne .LBB18_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB18_2
+; X64-NEXT: # BB#1: # %loadbb1
; X64-NEXT: movq 8(%rdi), %rcx
; X64-NEXT: movq 8(%rsi), %rdx
; X64-NEXT: bswapq %rcx
; X64-NEXT: bswapq %rdx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: jne .LBB18_1
-; X64-NEXT: # BB#3: # %endblock
-; X64-NEXT: retq
-; X64-NEXT: .LBB18_1: # %res_block
+; X64-NEXT: je .LBB18_3
+; X64-NEXT: .LBB18_2: # %res_block
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: setae %al
; X64-NEXT: leal -1(%rax,%rax), %eax
+; X64-NEXT: .LBB18_3: # %endblock
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind
ret i32 %m
@@ -754,19 +750,19 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind {
; X86-NEXT: retl
;
; X64-SSE2-LABEL: length24_eq:
-; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2: # BB#0:
; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT: jne .LBB22_1
-; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: jne .LBB22_2
+; X64-SSE2-NEXT: # BB#1: # %loadbb1
; X64-SSE2-NEXT: movq 16(%rdi), %rcx
; X64-SSE2-NEXT: xorl %eax, %eax
; X64-SSE2-NEXT: cmpq 16(%rsi), %rcx
; X64-SSE2-NEXT: je .LBB22_3
-; X64-SSE2-NEXT: .LBB22_1: # %res_block
+; X64-SSE2-NEXT: .LBB22_2: # %res_block
; X64-SSE2-NEXT: movl $1, %eax
; X64-SSE2-NEXT: .LBB22_3: # %endblock
; X64-SSE2-NEXT: testl %eax, %eax
@@ -774,18 +770,18 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind {
; X64-SSE2-NEXT: retq
;
; X64-AVX-LABEL: length24_eq:
-; X64-AVX: # BB#0: # %loadbb
+; X64-AVX: # BB#0:
; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
; X64-AVX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
; X64-AVX-NEXT: vpmovmskb %xmm0, %eax
; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-AVX-NEXT: jne .LBB22_1
-; X64-AVX-NEXT: # BB#2: # %loadbb1
+; X64-AVX-NEXT: jne .LBB22_2
+; X64-AVX-NEXT: # BB#1: # %loadbb1
; X64-AVX-NEXT: movq 16(%rdi), %rcx
; X64-AVX-NEXT: xorl %eax, %eax
; X64-AVX-NEXT: cmpq 16(%rsi), %rcx
; X64-AVX-NEXT: je .LBB22_3
-; X64-AVX-NEXT: .LBB22_1: # %res_block
+; X64-AVX-NEXT: .LBB22_2: # %res_block
; X64-AVX-NEXT: movl $1, %eax
; X64-AVX-NEXT: .LBB22_3: # %endblock
; X64-AVX-NEXT: testl %eax, %eax
@@ -810,18 +806,18 @@ define i1 @length24_eq_const(i8* %X) nounwind {
; X86-NEXT: retl
;
; X64-SSE2-LABEL: length24_eq_const:
-; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2: # BB#0:
; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT: jne .LBB23_1
-; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: jne .LBB23_2
+; X64-SSE2-NEXT: # BB#1: # %loadbb1
; X64-SSE2-NEXT: xorl %eax, %eax
; X64-SSE2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736
; X64-SSE2-NEXT: cmpq %rcx, 16(%rdi)
; X64-SSE2-NEXT: je .LBB23_3
-; X64-SSE2-NEXT: .LBB23_1: # %res_block
+; X64-SSE2-NEXT: .LBB23_2: # %res_block
; X64-SSE2-NEXT: movl $1, %eax
; X64-SSE2-NEXT: .LBB23_3: # %endblock
; X64-SSE2-NEXT: testl %eax, %eax
@@ -829,18 +825,18 @@ define i1 @length24_eq_const(i8* %X) nounwind {
; X64-SSE2-NEXT: retq
;
; X64-AVX-LABEL: length24_eq_const:
-; X64-AVX: # BB#0: # %loadbb
+; X64-AVX: # BB#0:
; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
; X64-AVX-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vpmovmskb %xmm0, %eax
; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-AVX-NEXT: jne .LBB23_1
-; X64-AVX-NEXT: # BB#2: # %loadbb1
+; X64-AVX-NEXT: jne .LBB23_2
+; X64-AVX-NEXT: # BB#1: # %loadbb1
; X64-AVX-NEXT: xorl %eax, %eax
; X64-AVX-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736
; X64-AVX-NEXT: cmpq %rcx, 16(%rdi)
; X64-AVX-NEXT: je .LBB23_3
-; X64-AVX-NEXT: .LBB23_1: # %res_block
+; X64-AVX-NEXT: .LBB23_2: # %res_block
; X64-AVX-NEXT: movl $1, %eax
; X64-AVX-NEXT: .LBB23_3: # %endblock
; X64-AVX-NEXT: testl %eax, %eax
@@ -898,7 +894,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind {
; X86-SSE1-NEXT: retl
;
; X86-SSE2-LABEL: length32_eq:
-; X86-SSE2: # BB#0: # %loadbb
+; X86-SSE2: # BB#0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
@@ -906,8 +902,8 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind {
; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
; X86-SSE2-NEXT: pmovmskb %xmm1, %edx
; X86-SSE2-NEXT: cmpl $65535, %edx # imm = 0xFFFF
-; X86-SSE2-NEXT: jne .LBB25_1
-; X86-SSE2-NEXT: # BB#2: # %loadbb1
+; X86-SSE2-NEXT: jne .LBB25_2
+; X86-SSE2-NEXT: # BB#1: # %loadbb1
; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0
; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1
; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
@@ -915,7 +911,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind {
; X86-SSE2-NEXT: xorl %eax, %eax
; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; X86-SSE2-NEXT: je .LBB25_3
-; X86-SSE2-NEXT: .LBB25_1: # %res_block
+; X86-SSE2-NEXT: .LBB25_2: # %res_block
; X86-SSE2-NEXT: movl $1, %eax
; X86-SSE2-NEXT: .LBB25_3: # %endblock
; X86-SSE2-NEXT: testl %eax, %eax
@@ -923,14 +919,14 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind {
; X86-SSE2-NEXT: retl
;
; X64-SSE2-LABEL: length32_eq:
-; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2: # BB#0:
; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT: jne .LBB25_1
-; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: jne .LBB25_2
+; X64-SSE2-NEXT: # BB#1: # %loadbb1
; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0
; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm1
; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
@@ -938,7 +934,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind {
; X64-SSE2-NEXT: xorl %eax, %eax
; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; X64-SSE2-NEXT: je .LBB25_3
-; X64-SSE2-NEXT: .LBB25_1: # %res_block
+; X64-SSE2-NEXT: .LBB25_2: # %res_block
; X64-SSE2-NEXT: movl $1, %eax
; X64-SSE2-NEXT: .LBB25_3: # %endblock
; X64-SSE2-NEXT: testl %eax, %eax
@@ -946,20 +942,20 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind {
; X64-SSE2-NEXT: retq
;
; X64-AVX1-LABEL: length32_eq:
-; X64-AVX1: # BB#0: # %loadbb
+; X64-AVX1: # BB#0:
; X64-AVX1-NEXT: vmovdqu (%rdi), %xmm0
; X64-AVX1-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
; X64-AVX1-NEXT: vpmovmskb %xmm0, %eax
; X64-AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-AVX1-NEXT: jne .LBB25_1
-; X64-AVX1-NEXT: # BB#2: # %loadbb1
+; X64-AVX1-NEXT: jne .LBB25_2
+; X64-AVX1-NEXT: # BB#1: # %loadbb1
; X64-AVX1-NEXT: vmovdqu 16(%rdi), %xmm0
; X64-AVX1-NEXT: vpcmpeqb 16(%rsi), %xmm0, %xmm0
; X64-AVX1-NEXT: vpmovmskb %xmm0, %ecx
; X64-AVX1-NEXT: xorl %eax, %eax
; X64-AVX1-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; X64-AVX1-NEXT: je .LBB25_3
-; X64-AVX1-NEXT: .LBB25_1: # %res_block
+; X64-AVX1-NEXT: .LBB25_2: # %res_block
; X64-AVX1-NEXT: movl $1, %eax
; X64-AVX1-NEXT: .LBB25_3: # %endblock
; X64-AVX1-NEXT: testl %eax, %eax
@@ -1006,21 +1002,21 @@ define i1 @length32_eq_const(i8* %X) nounwind {
; X86-SSE1-NEXT: retl
;
; X86-SSE2-LABEL: length32_eq_const:
-; X86-SSE2: # BB#0: # %loadbb
+; X86-SSE2: # BB#0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movdqu (%eax), %xmm0
; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx
; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
-; X86-SSE2-NEXT: jne .LBB26_1
-; X86-SSE2-NEXT: # BB#2: # %loadbb1
+; X86-SSE2-NEXT: jne .LBB26_2
+; X86-SSE2-NEXT: # BB#1: # %loadbb1
; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0
; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx
; X86-SSE2-NEXT: xorl %eax, %eax
; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; X86-SSE2-NEXT: je .LBB26_3
-; X86-SSE2-NEXT: .LBB26_1: # %res_block
+; X86-SSE2-NEXT: .LBB26_2: # %res_block
; X86-SSE2-NEXT: movl $1, %eax
; X86-SSE2-NEXT: .LBB26_3: # %endblock
; X86-SSE2-NEXT: testl %eax, %eax
@@ -1028,20 +1024,20 @@ define i1 @length32_eq_const(i8* %X) nounwind {
; X86-SSE2-NEXT: retl
;
; X64-SSE2-LABEL: length32_eq_const:
-; X64-SSE2: # BB#0: # %loadbb
+; X64-SSE2: # BB#0:
; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT: jne .LBB26_1
-; X64-SSE2-NEXT: # BB#2: # %loadbb1
+; X64-SSE2-NEXT: jne .LBB26_2
+; X64-SSE2-NEXT: # BB#1: # %loadbb1
; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0
; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
; X64-SSE2-NEXT: pmovmskb %xmm0, %ecx
; X64-SSE2-NEXT: xorl %eax, %eax
; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; X64-SSE2-NEXT: je .LBB26_3
-; X64-SSE2-NEXT: .LBB26_1: # %res_block
+; X64-SSE2-NEXT: .LBB26_2: # %res_block
; X64-SSE2-NEXT: movl $1, %eax
; X64-SSE2-NEXT: .LBB26_3: # %endblock
; X64-SSE2-NEXT: testl %eax, %eax
@@ -1049,20 +1045,20 @@ define i1 @length32_eq_const(i8* %X) nounwind {
; X64-SSE2-NEXT: retq
;
; X64-AVX1-LABEL: length32_eq_const:
-; X64-AVX1: # BB#0: # %loadbb
+; X64-AVX1: # BB#0:
; X64-AVX1-NEXT: vmovdqu (%rdi), %xmm0
; X64-AVX1-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX1-NEXT: vpmovmskb %xmm0, %eax
; X64-AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF
-; X64-AVX1-NEXT: jne .LBB26_1
-; X64-AVX1-NEXT: # BB#2: # %loadbb1
+; X64-AVX1-NEXT: jne .LBB26_2
+; X64-AVX1-NEXT: # BB#1: # %loadbb1
; X64-AVX1-NEXT: vmovdqu 16(%rdi), %xmm0
; X64-AVX1-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX1-NEXT: vpmovmskb %xmm0, %ecx
; X64-AVX1-NEXT: xorl %eax, %eax
; X64-AVX1-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; X64-AVX1-NEXT: je .LBB26_3
-; X64-AVX1-NEXT: .LBB26_1: # %res_block
+; X64-AVX1-NEXT: .LBB26_2: # %res_block
; X64-AVX1-NEXT: movl $1, %eax
; X64-AVX1-NEXT: .LBB26_3: # %endblock
; X64-AVX1-NEXT: testl %eax, %eax
@@ -1136,20 +1132,20 @@ define i1 @length64_eq(i8* %x, i8* %y) nounwind {
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: length64_eq:
-; X64-AVX2: # BB#0: # %loadbb
+; X64-AVX2: # BB#0:
; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
; X64-AVX2-NEXT: cmpl $-1, %eax
-; X64-AVX2-NEXT: jne .LBB28_1
-; X64-AVX2-NEXT: # BB#2: # %loadbb1
+; X64-AVX2-NEXT: jne .LBB28_2
+; X64-AVX2-NEXT: # BB#1: # %loadbb1
; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm0, %ymm0
; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx
; X64-AVX2-NEXT: xorl %eax, %eax
; X64-AVX2-NEXT: cmpl $-1, %ecx
; X64-AVX2-NEXT: je .LBB28_3
-; X64-AVX2-NEXT: .LBB28_1: # %res_block
+; X64-AVX2-NEXT: .LBB28_2: # %res_block
; X64-AVX2-NEXT: movl $1, %eax
; X64-AVX2-NEXT: .LBB28_3: # %endblock
; X64-AVX2-NEXT: testl %eax, %eax
@@ -1197,20 +1193,20 @@ define i1 @length64_eq_const(i8* %X) nounwind {
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: length64_eq_const:
-; X64-AVX2: # BB#0: # %loadbb
+; X64-AVX2: # BB#0:
; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
; X64-AVX2-NEXT: cmpl $-1, %eax
-; X64-AVX2-NEXT: jne .LBB29_1
-; X64-AVX2-NEXT: # BB#2: # %loadbb1
+; X64-AVX2-NEXT: jne .LBB29_2
+; X64-AVX2-NEXT: # BB#1: # %loadbb1
; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx
; X64-AVX2-NEXT: xorl %eax, %eax
; X64-AVX2-NEXT: cmpl $-1, %ecx
; X64-AVX2-NEXT: je .LBB29_3
-; X64-AVX2-NEXT: .LBB29_1: # %res_block
+; X64-AVX2-NEXT: .LBB29_2: # %res_block
; X64-AVX2-NEXT: movl $1, %eax
; X64-AVX2-NEXT: .LBB29_3: # %endblock
; X64-AVX2-NEXT: testl %eax, %eax
diff --git a/test/CodeGen/X86/memset-nonzero.ll b/test/CodeGen/X86/memset-nonzero.ll
index f0a957c9417..98e09377ddb 100644
--- a/test/CodeGen/X86/memset-nonzero.ll
+++ b/test/CodeGen/X86/memset-nonzero.ll
@@ -148,6 +148,7 @@ define void @memset_256_nonzero_bytes(i8* %x) {
; SSE-NEXT: movl $256, %edx # imm = 0x100
; SSE-NEXT: callq memset
; SSE-NEXT: popq %rax
+; SSE-NEXT: .cfi_def_cfa_offset 8
; SSE-NEXT: retq
;
; SSE2FAST-LABEL: memset_256_nonzero_bytes:
diff --git a/test/CodeGen/X86/merge-consecutive-loads-128.ll b/test/CodeGen/X86/merge-consecutive-loads-128.ll
index e414f5554de..b909b7c403b 100644
--- a/test/CodeGen/X86/merge-consecutive-loads-128.ll
+++ b/test/CodeGen/X86/merge-consecutive-loads-128.ll
@@ -72,7 +72,9 @@ define <2 x i64> @merge_2i64_i64_12(i64* %ptr) nounwind uwtable noinline ssp {
; X32-SSE1-NEXT: movl %esi, 4(%eax)
; X32-SSE1-NEXT: movl %edx, (%eax)
; X32-SSE1-NEXT: popl %esi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
; X32-SSE1-NEXT: popl %edi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 4
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: merge_2i64_i64_12:
@@ -384,6 +386,7 @@ define <4 x i32> @merge_4i32_i32_23u5(i32* %ptr) nounwind uwtable noinline ssp {
; X32-SSE1-NEXT: movl %edx, (%eax)
; X32-SSE1-NEXT: movl %ecx, 12(%eax)
; X32-SSE1-NEXT: popl %esi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 4
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: merge_4i32_i32_23u5:
@@ -435,7 +438,9 @@ define <4 x i32> @merge_4i32_i32_23u5_inc2(i32* %ptr) nounwind uwtable noinline
; X32-SSE1-NEXT: movl %edx, (%eax)
; X32-SSE1-NEXT: movl %ecx, 12(%eax)
; X32-SSE1-NEXT: popl %esi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
; X32-SSE1-NEXT: popl %edi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 4
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: merge_4i32_i32_23u5_inc2:
@@ -490,7 +495,9 @@ define <4 x i32> @merge_4i32_i32_23u5_inc3(i32* %ptr) nounwind uwtable noinline
; X32-SSE1-NEXT: movl %edx, (%eax)
; X32-SSE1-NEXT: movl %ecx, 12(%eax)
; X32-SSE1-NEXT: popl %esi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
; X32-SSE1-NEXT: popl %edi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 4
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: merge_4i32_i32_23u5_inc3:
@@ -649,7 +656,9 @@ define <4 x i32> @merge_4i32_i32_45zz_inc4(i32* %ptr) nounwind uwtable noinline
; X32-SSE1-NEXT: movl $0, 12(%eax)
; X32-SSE1-NEXT: movl $0, 8(%eax)
; X32-SSE1-NEXT: popl %esi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
; X32-SSE1-NEXT: popl %edi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 4
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: merge_4i32_i32_45zz_inc4:
@@ -701,7 +710,9 @@ define <4 x i32> @merge_4i32_i32_45zz_inc5(i32* %ptr) nounwind uwtable noinline
; X32-SSE1-NEXT: movl $0, 12(%eax)
; X32-SSE1-NEXT: movl $0, 8(%eax)
; X32-SSE1-NEXT: popl %esi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
; X32-SSE1-NEXT: popl %edi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 4
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: merge_4i32_i32_45zz_inc5:
@@ -751,7 +762,9 @@ define <8 x i16> @merge_8i16_i16_23u567u9(i16* %ptr) nounwind uwtable noinline s
; X32-SSE1-NEXT: movl %esi, 6(%eax)
; X32-SSE1-NEXT: movl %edx, (%eax)
; X32-SSE1-NEXT: popl %esi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
; X32-SSE1-NEXT: popl %edi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 4
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: merge_8i16_i16_23u567u9:
@@ -897,9 +910,13 @@ define <16 x i8> @merge_16i8_i8_01u3456789ABCDuF(i8* %ptr) nounwind uwtable noin
; X32-SSE1-NEXT: movl %esi, 3(%eax)
; X32-SSE1-NEXT: movw %bp, (%eax)
; X32-SSE1-NEXT: popl %esi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 16
; X32-SSE1-NEXT: popl %edi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
; X32-SSE1-NEXT: popl %ebx
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
; X32-SSE1-NEXT: popl %ebp
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 4
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: merge_16i8_i8_01u3456789ABCDuF:
@@ -1129,7 +1146,9 @@ define <2 x i64> @merge_2i64_i64_12_volatile(i64* %ptr) nounwind uwtable noinlin
; X32-SSE1-NEXT: movl %esi, 4(%eax)
; X32-SSE1-NEXT: movl %edx, (%eax)
; X32-SSE1-NEXT: popl %esi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
; X32-SSE1-NEXT: popl %edi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 4
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: merge_2i64_i64_12_volatile:
diff --git a/test/CodeGen/X86/movtopush.ll b/test/CodeGen/X86/movtopush.ll
index 051c8a710c8..ddcc383b65e 100644
--- a/test/CodeGen/X86/movtopush.ll
+++ b/test/CodeGen/X86/movtopush.ll
@@ -382,8 +382,10 @@ entry:
; LINUX: pushl $1
; LINUX: .cfi_adjust_cfa_offset 4
; LINUX: calll good
-; LINUX: addl $28, %esp
+; LINUX: addl $16, %esp
; LINUX: .cfi_adjust_cfa_offset -16
+; LINUX: addl $12, %esp
+; LINUX: .cfi_def_cfa_offset 4
; LINUX-NOT: add
; LINUX: retl
define void @pr27140() optsize {
diff --git a/test/CodeGen/X86/mul-constant-result.ll b/test/CodeGen/X86/mul-constant-result.ll
index 011b63ce726..f778397f889 100644
--- a/test/CodeGen/X86/mul-constant-result.ll
+++ b/test/CodeGen/X86/mul-constant-result.ll
@@ -34,84 +34,116 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
; X86-NEXT: .LBB0_6:
; X86-NEXT: addl %eax, %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_39:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: .LBB0_40:
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_7:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_8:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: shll $2, %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_9:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,4), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_10:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: addl %eax, %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_11:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (,%eax,8), %ecx
; X86-NEXT: jmp .LBB0_12
; X86-NEXT: .LBB0_13:
; X86-NEXT: shll $3, %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_14:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,8), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_15:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: addl %eax, %eax
; X86-NEXT: leal (%eax,%eax,4), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_16:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,4), %ecx
; X86-NEXT: leal (%eax,%ecx,2), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_17:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: shll $2, %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_18:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,2), %ecx
; X86-NEXT: leal (%eax,%ecx,4), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_19:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,2), %ecx
; X86-NEXT: jmp .LBB0_20
; X86-NEXT: .LBB0_21:
; X86-NEXT: leal (%eax,%eax,4), %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_22:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: shll $4, %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_23:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: shll $4, %ecx
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_24:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: addl %eax, %eax
; X86-NEXT: leal (%eax,%eax,8), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_25:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,4), %ecx
; X86-NEXT: shll $2, %ecx
; X86-NEXT: jmp .LBB0_12
@@ -119,20 +151,26 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
; X86-NEXT: shll $2, %eax
; X86-NEXT: leal (%eax,%eax,4), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_27:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,4), %ecx
; X86-NEXT: leal (%eax,%ecx,4), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_28:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,4), %ecx
; X86-NEXT: .LBB0_20:
; X86-NEXT: leal (%eax,%ecx,4), %ecx
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_29:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,2), %ecx
; X86-NEXT: shll $3, %ecx
; X86-NEXT: jmp .LBB0_12
@@ -140,13 +178,17 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
; X86-NEXT: shll $3, %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_31:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,4), %eax
; X86-NEXT: leal (%eax,%eax,4), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_32:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,8), %ecx
; X86-NEXT: leal (%ecx,%ecx,2), %ecx
; X86-NEXT: jmp .LBB0_12
@@ -154,21 +196,27 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
; X86-NEXT: leal (%eax,%eax,8), %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_34:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,8), %ecx
; X86-NEXT: leal (%ecx,%ecx,2), %ecx
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_35:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: leal (%eax,%eax,8), %ecx
; X86-NEXT: leal (%ecx,%ecx,2), %ecx
; X86-NEXT: addl %eax, %ecx
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_36:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: shll $5, %ecx
; X86-NEXT: subl %eax, %ecx
@@ -180,10 +228,13 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
; X86-NEXT: subl %eax, %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
; X86-NEXT: .LBB0_38:
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: shll $5, %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-HSW-LABEL: mult:
@@ -857,8 +908,11 @@ define i32 @foo() local_unnamed_addr #0 {
; X86-NEXT: negl %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 12
; X86-NEXT: popl %edi
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: popl %ebx
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-HSW-LABEL: foo:
@@ -1072,10 +1126,15 @@ define i32 @foo() local_unnamed_addr #0 {
; X64-HSW-NEXT: negl %ecx
; X64-HSW-NEXT: movl %ecx, %eax
; X64-HSW-NEXT: addq $8, %rsp
+; X64-HSW-NEXT: .cfi_def_cfa_offset 40
; X64-HSW-NEXT: popq %rbx
+; X64-HSW-NEXT: .cfi_def_cfa_offset 32
; X64-HSW-NEXT: popq %r14
+; X64-HSW-NEXT: .cfi_def_cfa_offset 24
; X64-HSW-NEXT: popq %r15
+; X64-HSW-NEXT: .cfi_def_cfa_offset 16
; X64-HSW-NEXT: popq %rbp
+; X64-HSW-NEXT: .cfi_def_cfa_offset 8
; X64-HSW-NEXT: retq
%1 = tail call i32 @mult(i32 1, i32 0)
%2 = icmp ne i32 %1, 1
diff --git a/test/CodeGen/X86/mul-i256.ll b/test/CodeGen/X86/mul-i256.ll
index 0a48ae761ec..1e05b95dda0 100644
--- a/test/CodeGen/X86/mul-i256.ll
+++ b/test/CodeGen/X86/mul-i256.ll
@@ -349,10 +349,15 @@ define void @test(i256* %a, i256* %b, i256* %out) #0 {
; X32-NEXT: movl %eax, 24(%ecx)
; X32-NEXT: movl %edx, 28(%ecx)
; X32-NEXT: addl $88, %esp
+; X32-NEXT: .cfi_def_cfa_offset 20
; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 16
; X32-NEXT: popl %edi
+; X32-NEXT: .cfi_def_cfa_offset 12
; X32-NEXT: popl %ebx
+; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: popl %ebp
+; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X64-LABEL: test:
@@ -421,8 +426,11 @@ define void @test(i256* %a, i256* %b, i256* %out) #0 {
; X64-NEXT: movq %rax, 16(%r9)
; X64-NEXT: movq %rdx, 24(%r9)
; X64-NEXT: popq %rbx
+; X64-NEXT: .cfi_def_cfa_offset 24
; X64-NEXT: popq %r14
+; X64-NEXT: .cfi_def_cfa_offset 16
; X64-NEXT: popq %r15
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
entry:
%av = load i256, i256* %a
diff --git a/test/CodeGen/X86/mul128.ll b/test/CodeGen/X86/mul128.ll
index 70a6173a19f..0c11f17d8d1 100644
--- a/test/CodeGen/X86/mul128.ll
+++ b/test/CodeGen/X86/mul128.ll
@@ -86,10 +86,15 @@ define i128 @foo(i128 %t, i128 %u) {
; X86-NEXT: movl %edx, 12(%ecx)
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: addl $8, %esp
+; X86-NEXT: .cfi_def_cfa_offset 20
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 16
; X86-NEXT: popl %edi
+; X86-NEXT: .cfi_def_cfa_offset 12
; X86-NEXT: popl %ebx
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: popl %ebp
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl $4
%k = mul i128 %t, %u
ret i128 %k
diff --git a/test/CodeGen/X86/no-plt.ll b/test/CodeGen/X86/no-plt.ll
new file mode 100644
index 00000000000..d6383c2d7d1
--- /dev/null
+++ b/test/CodeGen/X86/no-plt.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnu -relocation-model=pic \
+; RUN: | FileCheck -check-prefix=X64 %s
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnu \
+; RUN: | FileCheck -check-prefix=X64 %s
+
+define i32 @main() #0 {
+; X64: callq *_Z3foov@GOTPCREL(%rip)
+; X64: callq _Z3barv
+; X64: callq _Z3bazv
+
+entry:
+ %retval = alloca i32, align 4
+ store i32 0, i32* %retval, align 4
+ %call1 = call i32 @_Z3foov()
+ %call2 = call i32 @_Z3barv()
+ %call3 = call i32 @_Z3bazv()
+ ret i32 0
+}
+
+; Function Attrs: nonlazybind
+declare i32 @_Z3foov() #1
+
+declare i32 @_Z3barv() #2
+
+; Function Attrs: nonlazybind
+declare hidden i32 @_Z3bazv() #3
+
+
+attributes #1 = { nonlazybind }
+attributes #3 = { nonlazybind }
diff --git a/test/CodeGen/X86/pop-stack-cleanup-msvc.ll b/test/CodeGen/X86/pop-stack-cleanup-msvc.ll
new file mode 100644
index 00000000000..6330d3de72f
--- /dev/null
+++ b/test/CodeGen/X86/pop-stack-cleanup-msvc.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s | FileCheck %s
+
+target triple = "i686--windows-msvc"
+
+declare { i8*, i32 } @param2_ret2(i32, i32)
+declare i32 @__CxxFrameHandler3(...)
+
+
+define void @test_reserved_regs() minsize optsize personality i32 (...)* @__CxxFrameHandler3 {
+; CHECK-LABEL: test_reserved_regs:
+; CHECK: calll _param2_ret2
+; CHECK-NEXT: popl %ecx
+; CHECK-NEXT: popl %edi
+start:
+ %s = alloca i64
+ store i64 4, i64* %s
+ %0 = invoke { i8*, i32 } @param2_ret2(i32 0, i32 1)
+ to label %out unwind label %cleanup
+
+out:
+ ret void
+
+cleanup:
+ %cp = cleanuppad within none []
+ cleanupret from %cp unwind to caller
+}
diff --git a/test/CodeGen/X86/pr21792.ll b/test/CodeGen/X86/pr21792.ll
index 74f6c5a361f..54eb1fc7272 100644
--- a/test/CodeGen/X86/pr21792.ll
+++ b/test/CodeGen/X86/pr21792.ll
@@ -28,6 +28,7 @@ define void @func(<4 x float> %vx) {
; CHECK-NEXT: leaq stuff+8(%r9), %r9
; CHECK-NEXT: callq toto
; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
entry:
%tmp2 = bitcast <4 x float> %vx to <2 x i64>
diff --git a/test/CodeGen/X86/pr29061.ll b/test/CodeGen/X86/pr29061.ll
index 0cbe75f9ad5..b62d082507d 100644
--- a/test/CodeGen/X86/pr29061.ll
+++ b/test/CodeGen/X86/pr29061.ll
@@ -15,6 +15,7 @@ define void @t1(i8 signext %c) {
; CHECK-NEXT: #APP
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: popl %edi
+; CHECK-NEXT: .cfi_def_cfa_offset 4
; CHECK-NEXT: retl
entry:
tail call void asm sideeffect "", "{di},~{dirflag},~{fpsr},~{flags}"(i8 %c)
@@ -32,6 +33,7 @@ define void @t2(i8 signext %c) {
; CHECK-NEXT: #APP
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: popl %esi
+; CHECK-NEXT: .cfi_def_cfa_offset 4
; CHECK-NEXT: retl
entry:
tail call void asm sideeffect "", "{si},~{dirflag},~{fpsr},~{flags}"(i8 %c)
diff --git a/test/CodeGen/X86/pr29112.ll b/test/CodeGen/X86/pr29112.ll
index cc670eeb978..d791936bd53 100644
--- a/test/CodeGen/X86/pr29112.ll
+++ b/test/CodeGen/X86/pr29112.ll
@@ -65,6 +65,7 @@ define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, <
; CHECK-NEXT: vaddps {{[0-9]+}}(%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
; CHECK-NEXT: addq $88, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%a1 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
diff --git a/test/CodeGen/X86/pr30430.ll b/test/CodeGen/X86/pr30430.ll
index 0254c0940b8..06007a3a4cf 100644
--- a/test/CodeGen/X86/pr30430.ll
+++ b/test/CodeGen/X86/pr30430.ll
@@ -108,6 +108,7 @@ define <16 x float> @makefloat(float %f1, float %f2, float %f3, float %f4, float
; CHECK-NEXT: vmovss %xmm14, (%rsp) # 4-byte Spill
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa %rsp, 8
; CHECK-NEXT: retq
entry:
%__A.addr.i = alloca float, align 4
diff --git a/test/CodeGen/X86/pr32241.ll b/test/CodeGen/X86/pr32241.ll
index f48fef5f7fb..02f3bb12291 100644
--- a/test/CodeGen/X86/pr32241.ll
+++ b/test/CodeGen/X86/pr32241.ll
@@ -50,7 +50,9 @@ define i32 @_Z3foov() {
; CHECK-NEXT: movw %dx, {{[0-9]+}}(%esp)
; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: addl $16, %esp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: popl %esi
+; CHECK-NEXT: .cfi_def_cfa_offset 4
; CHECK-NEXT: retl
entry:
%aa = alloca i16, align 2
diff --git a/test/CodeGen/X86/pr32256.ll b/test/CodeGen/X86/pr32256.ll
index f6e254aaad0..5b6126fbc76 100644
--- a/test/CodeGen/X86/pr32256.ll
+++ b/test/CodeGen/X86/pr32256.ll
@@ -27,6 +27,7 @@ define void @_Z1av() {
; CHECK-NEXT: andb $1, %al
; CHECK-NEXT: movb %al, {{[0-9]+}}(%esp)
; CHECK-NEXT: addl $2, %esp
+; CHECK-NEXT: .cfi_def_cfa_offset 4
; CHECK-NEXT: retl
entry:
%b = alloca i8, align 1
diff --git a/test/CodeGen/X86/pr32282.ll b/test/CodeGen/X86/pr32282.ll
index d6e6f6eb107..67a0332ac53 100644
--- a/test/CodeGen/X86/pr32282.ll
+++ b/test/CodeGen/X86/pr32282.ll
@@ -43,6 +43,7 @@ define void @foo() {
; X86-NEXT: orl %eax, %edx
; X86-NEXT: setne {{[0-9]+}}(%esp)
; X86-NEXT: popl %eax
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: foo:
diff --git a/test/CodeGen/X86/pr32284.ll b/test/CodeGen/X86/pr32284.ll
index 11eb6968709..59be67f0579 100644
--- a/test/CodeGen/X86/pr32284.ll
+++ b/test/CodeGen/X86/pr32284.ll
@@ -71,6 +71,7 @@ define void @foo() {
; 686-O0-NEXT: movzbl %al, %ecx
; 686-O0-NEXT: movl %ecx, (%esp)
; 686-O0-NEXT: addl $8, %esp
+; 686-O0-NEXT: .cfi_def_cfa_offset 4
; 686-O0-NEXT: retl
;
; 686-LABEL: foo:
@@ -88,6 +89,7 @@ define void @foo() {
; 686-NEXT: setle %dl
; 686-NEXT: movl %edx, {{[0-9]+}}(%esp)
; 686-NEXT: addl $8, %esp
+; 686-NEXT: .cfi_def_cfa_offset 4
; 686-NEXT: retl
entry:
%a = alloca i8, align 1
@@ -232,10 +234,15 @@ define void @f1() {
; 686-O0-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
; 686-O0-NEXT: movl %esi, (%esp) # 4-byte Spill
; 686-O0-NEXT: addl $36, %esp
+; 686-O0-NEXT: .cfi_def_cfa_offset 20
; 686-O0-NEXT: popl %esi
+; 686-O0-NEXT: .cfi_def_cfa_offset 16
; 686-O0-NEXT: popl %edi
+; 686-O0-NEXT: .cfi_def_cfa_offset 12
; 686-O0-NEXT: popl %ebx
+; 686-O0-NEXT: .cfi_def_cfa_offset 8
; 686-O0-NEXT: popl %ebp
+; 686-O0-NEXT: .cfi_def_cfa_offset 4
; 686-O0-NEXT: retl
;
; 686-LABEL: f1:
@@ -277,8 +284,11 @@ define void @f1() {
; 686-NEXT: movl %eax, _ZN8struct_210member_2_0E
; 686-NEXT: movl $0, _ZN8struct_210member_2_0E+4
; 686-NEXT: addl $1, %esp
+; 686-NEXT: .cfi_def_cfa_offset 12
; 686-NEXT: popl %esi
+; 686-NEXT: .cfi_def_cfa_offset 8
; 686-NEXT: popl %edi
+; 686-NEXT: .cfi_def_cfa_offset 4
; 686-NEXT: retl
entry:
%a = alloca i8, align 1
@@ -392,8 +402,11 @@ define void @f2() {
; 686-O0-NEXT: movw %cx, %di
; 686-O0-NEXT: movw %di, (%eax)
; 686-O0-NEXT: addl $2, %esp
+; 686-O0-NEXT: .cfi_def_cfa_offset 12
; 686-O0-NEXT: popl %esi
+; 686-O0-NEXT: .cfi_def_cfa_offset 8
; 686-O0-NEXT: popl %edi
+; 686-O0-NEXT: .cfi_def_cfa_offset 4
; 686-O0-NEXT: retl
;
; 686-LABEL: f2:
@@ -414,6 +427,7 @@ define void @f2() {
; 686-NEXT: sete %dl
; 686-NEXT: movw %dx, (%eax)
; 686-NEXT: addl $2, %esp
+; 686-NEXT: .cfi_def_cfa_offset 4
; 686-NEXT: retl
entry:
%a = alloca i16, align 2
@@ -532,6 +546,7 @@ define void @f3() #0 {
; 686-O0-NEXT: popl %esi
; 686-O0-NEXT: popl %edi
; 686-O0-NEXT: popl %ebp
+; 686-O0-NEXT: .cfi_def_cfa %esp, 4
; 686-O0-NEXT: retl
;
; 686-LABEL: f3:
@@ -558,6 +573,7 @@ define void @f3() #0 {
; 686-NEXT: movl %ecx, var_46
; 686-NEXT: movl %ebp, %esp
; 686-NEXT: popl %ebp
+; 686-NEXT: .cfi_def_cfa %esp, 4
; 686-NEXT: retl
entry:
%a = alloca i64, align 8
diff --git a/test/CodeGen/X86/pr32329.ll b/test/CodeGen/X86/pr32329.ll
index f6bdade24c6..9d1bb90e824 100644
--- a/test/CodeGen/X86/pr32329.ll
+++ b/test/CodeGen/X86/pr32329.ll
@@ -57,9 +57,13 @@ define void @foo() local_unnamed_addr {
; X86-NEXT: imull %eax, %ebx
; X86-NEXT: movb %bl, var_218
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 16
; X86-NEXT: popl %edi
+; X86-NEXT: .cfi_def_cfa_offset 12
; X86-NEXT: popl %ebx
+; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: popl %ebp
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: foo:
diff --git a/test/CodeGen/X86/pr32345.ll b/test/CodeGen/X86/pr32345.ll
index f6802887e9e..2bdeca20731 100644
--- a/test/CodeGen/X86/pr32345.ll
+++ b/test/CodeGen/X86/pr32345.ll
@@ -84,6 +84,7 @@ define void @foo() {
; 6860-NEXT: popl %edi
; 6860-NEXT: popl %ebx
; 6860-NEXT: popl %ebp
+; 6860-NEXT: .cfi_def_cfa %esp, 4
; 6860-NEXT: retl
;
; X64-LABEL: foo:
@@ -127,6 +128,7 @@ define void @foo() {
; 686-NEXT: movb %dl, (%eax)
; 686-NEXT: movl %ebp, %esp
; 686-NEXT: popl %ebp
+; 686-NEXT: .cfi_def_cfa %esp, 4
; 686-NEXT: retl
bb:
%tmp = alloca i64, align 8
diff --git a/test/CodeGen/X86/pr32451.ll b/test/CodeGen/X86/pr32451.ll
index 67c0cb39f8c..5b7d1373d34 100644
--- a/test/CodeGen/X86/pr32451.ll
+++ b/test/CodeGen/X86/pr32451.ll
@@ -30,7 +30,9 @@ define i8** @japi1_convert_690(i8**, i8***, i32) {
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
; CHECK-NEXT: movl %eax, (%ecx)
; CHECK-NEXT: addl $16, %esp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: popl %ebx
+; CHECK-NEXT: .cfi_def_cfa_offset 4
; CHECK-NEXT: retl
top:
%3 = alloca i8***
diff --git a/test/CodeGen/X86/pr34088.ll b/test/CodeGen/X86/pr34088.ll
index 2049c5507c6..4d85722057f 100644
--- a/test/CodeGen/X86/pr34088.ll
+++ b/test/CodeGen/X86/pr34088.ll
@@ -27,6 +27,7 @@ define i32 @pr34088() local_unnamed_addr {
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl %ebp, %esp
; CHECK-NEXT: popl %ebp
+; CHECK-NEXT: .cfi_def_cfa %esp, 4
; CHECK-NEXT: retl
entry:
%foo = alloca %struct.Foo, align 4
diff --git a/test/CodeGen/X86/pr34653.ll b/test/CodeGen/X86/pr34653.ll
new file mode 100644
index 00000000000..129dbcacc95
--- /dev/null
+++ b/test/CodeGen/X86/pr34653.ll
@@ -0,0 +1,210 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+avx512f -o - | FileCheck %s
+
+declare fastcc <38 x double> @test()
+
+define void @pr34653() {
+; CHECK-LABEL: pr34653:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: .cfi_def_cfa_register %rbp
+; CHECK-NEXT: andq $-512, %rsp # imm = 0xFE00
+; CHECK-NEXT: subq $2048, %rsp # imm = 0x800
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT: callq test
+; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0
+; CHECK-NEXT: vmovaps %xmm0, %xmm1
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm2
+; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm3
+; CHECK-NEXT: vmovaps %xmm3, %xmm4
+; CHECK-NEXT: vmovaps %xmm2, %xmm5
+; CHECK-NEXT: vmovaps %xmm5, %xmm6
+; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm7
+; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm8
+; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm9
+; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm10
+; CHECK-NEXT: vextractf32x4 $3, %zmm10, %xmm11
+; CHECK-NEXT: vmovaps %xmm11, %xmm12
+; CHECK-NEXT: vextractf32x4 $2, %zmm10, %xmm13
+; CHECK-NEXT: vmovaps %xmm13, %xmm14
+; CHECK-NEXT: vmovaps %xmm10, %xmm15
+; CHECK-NEXT: vmovaps %xmm15, %xmm2
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vextractf32x4 $3, %zmm9, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vextractf32x4 $2, %zmm9, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps %xmm9, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vextractf32x4 $3, %zmm8, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vextractf32x4 $2, %zmm8, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps %xmm8, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vextractf32x4 $3, %zmm7, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vextractf32x4 $2, %zmm7, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps %xmm7, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm11 = xmm11[1,0]
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm13 = xmm13[1,0]
+; CHECK-NEXT: # kill: %YMM10<def> %YMM10<kill> %ZMM10<kill>
+; CHECK-NEXT: vextractf128 $1, %ymm10, %xmm10
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps %xmm10, %xmm0
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm15 = xmm15[1,0]
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: # kill: %YMM9<def> %YMM9<kill> %ZMM9<kill>
+; CHECK-NEXT: vextractf128 $1, %ymm9, %xmm9
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps %xmm9, %xmm0
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: # kill: %YMM8<def> %YMM8<kill> %ZMM8<kill>
+; CHECK-NEXT: vextractf128 $1, %ymm8, %xmm8
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps %xmm8, %xmm0
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: # kill: %YMM7<def> %YMM7<kill> %ZMM7<kill>
+; CHECK-NEXT: vextractf128 $1, %ymm7, %xmm7
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps %xmm7, %xmm0
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm10 = xmm10[1,0]
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0]
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm8 = xmm8[1,0]
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm7 = xmm7[1,0]
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm8, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm13, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm1, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm14, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm2, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm4, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm9, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm10, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm15, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm11, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm3, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm6, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm5, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm12, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm7, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: movq %rbp, %rsp
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa %rsp, 8
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %v = call fastcc <38 x double> @test()
+ %v.0 = extractelement <38 x double> %v, i32 0
+ ret void
+}
+
diff --git a/test/CodeGen/X86/pr34657.ll b/test/CodeGen/X86/pr34657.ll
new file mode 100644
index 00000000000..a63bc2a08dd
--- /dev/null
+++ b/test/CodeGen/X86/pr34657.ll
@@ -0,0 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw -o - | FileCheck %s
+
+define <112 x i8> @pr34657() local_unnamed_addr {
+; CHECK-LABEL: pr34657
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vmovups (%rax), %xmm0
+; CHECK-NEXT: vmovups (%rax), %ymm1
+; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; CHECK-NEXT: vmovups (%rax), %zmm2
+; CHECK-NEXT: vmovaps %ymm1, 64(%rdi)
+; CHECK-NEXT: vmovaps %zmm2, (%rdi)
+; CHECK-NEXT: vextractf32x4 $2, %zmm0, 96(%rdi)
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %wide.vec51 = load <112 x i8>, <112 x i8>* undef, align 2
+ ret <112 x i8> %wide.vec51
+}
diff --git a/test/CodeGen/X86/pr9743.ll b/test/CodeGen/X86/pr9743.ll
index 73b3c7f835c..ac3d4575510 100644
--- a/test/CodeGen/X86/pr9743.ll
+++ b/test/CodeGen/X86/pr9743.ll
@@ -11,4 +11,5 @@ define void @f() {
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: .cfi_def_cfa_register %rbp
; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa %rsp, 8
; CHECK-NEXT: ret
diff --git a/test/CodeGen/X86/push-cfi-debug.ll b/test/CodeGen/X86/push-cfi-debug.ll
index 7f438e306e4..01fa12e87d0 100644
--- a/test/CodeGen/X86/push-cfi-debug.ll
+++ b/test/CodeGen/X86/push-cfi-debug.ll
@@ -23,8 +23,10 @@ declare x86_stdcallcc void @stdfoo(i32, i32) #0
; CHECK: .cfi_adjust_cfa_offset 4
; CHECK: calll stdfoo
; CHECK: .cfi_adjust_cfa_offset -8
-; CHECK: addl $20, %esp
+; CHECK: addl $8, %esp
; CHECK: .cfi_adjust_cfa_offset -8
+; CHECK: addl $12, %esp
+; CHECK: .cfi_def_cfa_offset 4
define void @test1() #0 !dbg !4 {
entry:
tail call void @foo(i32 1, i32 2) #1, !dbg !10
diff --git a/test/CodeGen/X86/push-cfi-obj.ll b/test/CodeGen/X86/push-cfi-obj.ll
index 33291ec3318..2c9ec334027 100644
--- a/test/CodeGen/X86/push-cfi-obj.ll
+++ b/test/CodeGen/X86/push-cfi-obj.ll
@@ -12,7 +12,7 @@
; LINUX-NEXT: ]
; LINUX-NEXT: Address: 0x0
; LINUX-NEXT: Offset: 0x68
-; LINUX-NEXT: Size: 64
+; LINUX-NEXT: Size: 72
; LINUX-NEXT: Link: 0
; LINUX-NEXT: Info: 0
; LINUX-NEXT: AddressAlignment: 4
@@ -22,8 +22,9 @@
; LINUX-NEXT: SectionData (
; LINUX-NEXT: 0000: 1C000000 00000000 017A504C 5200017C |.........zPLR..||
; LINUX-NEXT: 0010: 08070000 00000000 1B0C0404 88010000 |................|
-; LINUX-NEXT: 0020: 1C000000 24000000 00000000 1D000000 |....$...........|
+; LINUX-NEXT: 0020: 24000000 24000000 00000000 1D000000 |$...$...........|
; LINUX-NEXT: 0030: 04000000 00410E08 8502420D 05432E10 |.....A....B..C..|
+; LINUX-NEXT: 0040: 540C0404 410C0508 |T...A...|
; LINUX-NEXT: )
declare i32 @__gxx_personality_v0(...)
@@ -35,7 +36,7 @@ entry:
to label %continue unwind label %cleanup
continue:
ret void
-cleanup:
+cleanup:
landingpad { i8*, i32 }
cleanup
ret void
diff --git a/test/CodeGen/X86/push-cfi.ll b/test/CodeGen/X86/push-cfi.ll
index 91e579a8391..44f8bf857c4 100644
--- a/test/CodeGen/X86/push-cfi.ll
+++ b/test/CodeGen/X86/push-cfi.ll
@@ -74,8 +74,9 @@ cleanup:
; LINUX-NEXT: pushl $1
; LINUX-NEXT: .cfi_adjust_cfa_offset 4
; LINUX-NEXT: call
-; LINUX-NEXT: addl $28, %esp
+; LINUX-NEXT: addl $16, %esp
; LINUX: .cfi_adjust_cfa_offset -16
+; LINUX: addl $12, %esp
; DARWIN-NOT: .cfi_escape
; DARWIN-NOT: pushl
define void @test2_nofp() #0 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
diff --git a/test/CodeGen/X86/recip-fastmath.ll b/test/CodeGen/X86/recip-fastmath.ll
index 0e9d149373b..296d165b3eb 100644
--- a/test/CodeGen/X86/recip-fastmath.ll
+++ b/test/CodeGen/X86/recip-fastmath.ll
@@ -144,14 +144,14 @@ define float @f32_one_step(float %x) #1 {
;
; KNL-LABEL: f32_one_step:
; KNL: # BB#0:
-; KNL-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1
+; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50]
; KNL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
; KNL-NEXT: retq # sched: [2:1.00]
;
; SKX-LABEL: f32_one_step:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [9:0.50]
; SKX-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -257,7 +257,7 @@ define float @f32_two_step(float %x) #2 {
;
; KNL-LABEL: f32_two_step:
; KNL: # BB#0:
-; KNL-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1
+; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50]
; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; KNL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
@@ -268,7 +268,7 @@ define float @f32_two_step(float %x) #2 {
;
; SKX-LABEL: f32_two_step:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; SKX-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [4:0.33]
@@ -416,7 +416,7 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
;
; SKX-LABEL: v4f32_one_step:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 # sched: [10:0.50]
; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -533,7 +533,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
;
; SKX-LABEL: v4f32_two_step:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [4:0.33]
@@ -691,7 +691,7 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
;
; SKX-LABEL: v8f32_one_step:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %ymm0, %ymm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 # sched: [11:0.50]
; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
@@ -821,7 +821,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
;
; SKX-LABEL: v8f32_two_step:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %ymm0, %ymm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00]
; SKX-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; SKX-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [4:0.33]
diff --git a/test/CodeGen/X86/recip-fastmath2.ll b/test/CodeGen/X86/recip-fastmath2.ll
index a263e9d3b65..f6eeeec57f1 100644
--- a/test/CodeGen/X86/recip-fastmath2.ll
+++ b/test/CodeGen/X86/recip-fastmath2.ll
@@ -56,13 +56,13 @@ define float @f32_no_step_2(float %x) #3 {
;
; KNL-LABEL: f32_no_step_2:
; KNL: # BB#0:
-; KNL-NEXT: vrcp14ss %xmm0, %xmm0, %xmm0
+; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
; KNL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
; KNL-NEXT: retq # sched: [2:1.00]
;
; SKX-LABEL: f32_no_step_2:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ss %xmm0, %xmm0, %xmm0 # sched: [4:1.00]
+; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [4:1.00]
; SKX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast float 1234.0, %x
@@ -144,7 +144,7 @@ define float @f32_one_step_2(float %x) #1 {
;
; KNL-LABEL: f32_one_step_2:
; KNL: # BB#0:
-; KNL-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1
+; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50]
; KNL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
; KNL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [5:0.50]
@@ -152,7 +152,7 @@ define float @f32_one_step_2(float %x) #1 {
;
; SKX-LABEL: f32_one_step_2:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [9:0.50]
; SKX-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
; SKX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
@@ -243,7 +243,7 @@ define float @f32_one_step_2_divs(float %x) #1 {
;
; KNL-LABEL: f32_one_step_2_divs:
; KNL: # BB#0:
-; KNL-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1
+; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [5:0.50]
; KNL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
; KNL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [5:0.50]
@@ -252,7 +252,7 @@ define float @f32_one_step_2_divs(float %x) #1 {
;
; SKX-LABEL: f32_one_step_2_divs:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [9:0.50]
; SKX-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
; SKX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
@@ -368,7 +368,7 @@ define float @f32_two_step_2(float %x) #2 {
;
; KNL-LABEL: f32_two_step_2:
; KNL: # BB#0:
-; KNL-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1
+; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; KNL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50]
; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; KNL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
@@ -380,7 +380,7 @@ define float @f32_two_step_2(float %x) #2 {
;
; SKX-LABEL: f32_two_step_2:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; SKX-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [4:0.33]
@@ -478,7 +478,7 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
;
; SKX-LABEL: v4f32_one_step2:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 # sched: [10:0.50]
; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
@@ -580,7 +580,7 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
;
; SKX-LABEL: v4f32_one_step_2_divs:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 # sched: [10:0.50]
; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:0.50]
@@ -708,7 +708,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
;
; SKX-LABEL: v4f32_two_step2:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
; SKX-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [4:0.33]
@@ -814,7 +814,7 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
;
; SKX-LABEL: v8f32_one_step2:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %ymm0, %ymm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 # sched: [11:0.50]
; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [4:0.33]
; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [11:0.50]
@@ -925,7 +925,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
;
; SKX-LABEL: v8f32_one_step_2_divs:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %ymm0, %ymm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00]
; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 # sched: [11:0.50]
; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [4:0.33]
; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [11:0.50]
@@ -1067,7 +1067,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
;
; SKX-LABEL: v8f32_two_step2:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %ymm0, %ymm1 # sched: [4:1.00]
+; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00]
; SKX-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; SKX-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [4:0.33]
@@ -1124,7 +1124,7 @@ define <8 x float> @v8f32_no_step(<8 x float> %x) #3 {
;
; SKX-LABEL: v8f32_no_step:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %ymm0, %ymm0 # sched: [4:1.00]
+; SKX-NEXT: vrcpps %ymm0, %ymm0 # sched: [4:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <8 x float> %div
@@ -1183,7 +1183,7 @@ define <8 x float> @v8f32_no_step2(<8 x float> %x) #3 {
;
; SKX-LABEL: v8f32_no_step2:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %ymm0, %ymm0 # sched: [4:1.00]
+; SKX-NEXT: vrcpps %ymm0, %ymm0 # sched: [4:1.00]
; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [11:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
diff --git a/test/CodeGen/X86/return-ext.ll b/test/CodeGen/X86/return-ext.ll
index ef160f43b4a..c66e518943a 100644
--- a/test/CodeGen/X86/return-ext.ll
+++ b/test/CodeGen/X86/return-ext.ll
@@ -106,6 +106,7 @@ entry:
; CHECK: call
; CHECK-NEXT: movzbl
; CHECK-NEXT: {{pop|add}}
+; CHECK-NEXT: .cfi_def_cfa_offset {{4|8}}
; CHECK-NEXT: ret
}
@@ -120,6 +121,7 @@ entry:
; CHECK: call
; CHECK-NEXT: movzbl
; CHECK-NEXT: {{pop|add}}
+; CHECK-NEXT: .cfi_def_cfa_offset {{4|8}}
; CHECK-NEXT: ret
}
@@ -134,5 +136,6 @@ entry:
; CHECK: call
; CHECK-NEXT: movzwl
; CHECK-NEXT: {{pop|add}}
+; CHECK-NEXT: .cfi_def_cfa_offset {{4|8}}
; CHECK-NEXT: ret
}
diff --git a/test/CodeGen/X86/rtm.ll b/test/CodeGen/X86/rtm.ll
index bd2d3e544bd..a1feeb5999b 100644
--- a/test/CodeGen/X86/rtm.ll
+++ b/test/CodeGen/X86/rtm.ll
@@ -75,6 +75,7 @@ define void @f2(i32 %x) nounwind uwtable {
; X64-NEXT: xabort $1
; X64-NEXT: callq f1
; X64-NEXT: popq %rax
+; X64-NEXT: .cfi_def_cfa_offset 8
; X64-NEXT: retq
entry:
%x.addr = alloca i32, align 4
diff --git a/test/CodeGen/X86/schedule-x86_32.ll b/test/CodeGen/X86/schedule-x86_32.ll
new file mode 100644
index 00000000000..5dc06e61cc6
--- /dev/null
+++ b/test/CodeGen/X86/schedule-x86_32.ll
@@ -0,0 +1,348 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=i686 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=atom | FileCheck %s --check-prefix=CHECK --check-prefix=ATOM
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
+
+define i8 @test_aaa(i8 %a0) optsize {
+; GENERIC-LABEL: test_aaa:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: movb {{[0-9]+}}(%esp), %al
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: aaa
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_aaa:
+; ATOM: # BB#0:
+; ATOM-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: aaa
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_aaa:
+; SLM: # BB#0:
+; SLM-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: aaa
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_aaa:
+; SANDY: # BB#0:
+; SANDY-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: aaa
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_aaa:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [1:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: aaa
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [5:0.50]
+;
+; BROADWELL-LABEL: test_aaa:
+; BROADWELL: # BB#0:
+; BROADWELL-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: aaa
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_aaa:
+; SKYLAKE: # BB#0:
+; SKYLAKE-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: aaa
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_aaa:
+; SKX: # BB#0:
+; SKX-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: aaa
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_aaa:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: aaa
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_aaa:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: aaa
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ %1 = tail call i8 asm "aaa", "=r,r"(i8 %a0) nounwind
+ ret i8 %1
+}
+
+define i8 @test_aad(i16 %a0) optsize {
+; GENERIC-LABEL: test_aad:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: aad
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_aad:
+; ATOM: # BB#0:
+; ATOM-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: aad
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_aad:
+; SLM: # BB#0:
+; SLM-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: aad
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_aad:
+; SANDY: # BB#0:
+; SANDY-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: aad
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_aad:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [4:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: aad
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [5:0.50]
+;
+; BROADWELL-LABEL: test_aad:
+; BROADWELL: # BB#0:
+; BROADWELL-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: aad
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_aad:
+; SKYLAKE: # BB#0:
+; SKYLAKE-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: aad
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_aad:
+; SKX: # BB#0:
+; SKX-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: aad
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_aad:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: aad
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_aad:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: aad
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ %1 = tail call i8 asm "aad", "=r,r"(i16 %a0) nounwind
+ ret i8 %1
+}
+
+define i16 @test_aam(i8 %a0) optsize {
+; GENERIC-LABEL: test_aam:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: movb {{[0-9]+}}(%esp), %al
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: aam
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_aam:
+; ATOM: # BB#0:
+; ATOM-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: aam
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_aam:
+; SLM: # BB#0:
+; SLM-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: aam
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_aam:
+; SANDY: # BB#0:
+; SANDY-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: aam
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_aam:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [1:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: aam
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [5:0.50]
+;
+; BROADWELL-LABEL: test_aam:
+; BROADWELL: # BB#0:
+; BROADWELL-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: aam
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_aam:
+; SKYLAKE: # BB#0:
+; SKYLAKE-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: aam
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_aam:
+; SKX: # BB#0:
+; SKX-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: aam
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_aam:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: aam
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_aam:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: aam
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ %1 = tail call i16 asm "aam", "=r,r"(i8 %a0) nounwind
+ ret i16 %1
+}
+
+define i8 @test_aas(i8 %a0) optsize {
+; GENERIC-LABEL: test_aas:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: movb {{[0-9]+}}(%esp), %al
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: aas
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_aas:
+; ATOM: # BB#0:
+; ATOM-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: aas
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_aas:
+; SLM: # BB#0:
+; SLM-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: aas
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_aas:
+; SANDY: # BB#0:
+; SANDY-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: aas
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_aas:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [1:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: aas
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [5:0.50]
+;
+; BROADWELL-LABEL: test_aas:
+; BROADWELL: # BB#0:
+; BROADWELL-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: aas
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_aas:
+; SKYLAKE: # BB#0:
+; SKYLAKE-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: aas
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_aas:
+; SKX: # BB#0:
+; SKX-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: aas
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_aas:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: aas
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_aas:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: aas
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ %1 = tail call i8 asm "aas", "=r,r"(i8 %a0) nounwind
+ ret i8 %1
+}
diff --git a/test/CodeGen/X86/schedule-x86_64.ll b/test/CodeGen/X86/schedule-x86_64.ll
new file mode 100644
index 00000000000..1db8c8768bd
--- /dev/null
+++ b/test/CodeGen/X86/schedule-x86_64.ll
@@ -0,0 +1,737 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=atom | FileCheck %s --check-prefix=CHECK --check-prefix=ATOM
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
+
+define i16 @test_bsf16(i16 %a0, i16* %a1) optsize {
+; GENERIC-LABEL: test_bsf16:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: bsfw %di, %ax
+; GENERIC-NEXT: bsfw (%rsi), %cx
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_bsf16:
+; ATOM: # BB#0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: bsfw %di, %ax
+; ATOM-NEXT: bsfw (%rsi), %cx
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; ATOM-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_bsf16:
+; SLM: # BB#0:
+; SLM-NEXT: #APP
+; SLM-NEXT: bsfw %di, %ax
+; SLM-NEXT: bsfw (%rsi), %cx
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; SLM-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_bsf16:
+; SANDY: # BB#0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: bsfw %di, %ax
+; SANDY-NEXT: bsfw (%rsi), %cx
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; SANDY-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_bsf16:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: bsfw %di, %ax
+; HASWELL-NEXT: bsfw (%rsi), %cx
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; HASWELL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; HASWELL-NEXT: retq # sched: [2:1.00]
+;
+; BROADWELL-LABEL: test_bsf16:
+; BROADWELL: # BB#0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: bsfw %di, %ax
+; BROADWELL-NEXT: bsfw (%rsi), %cx
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_bsf16:
+; SKYLAKE: # BB#0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: bsfw %di, %ax
+; SKYLAKE-NEXT: bsfw (%rsi), %cx
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_bsf16:
+; SKX: # BB#0:
+; SKX-NEXT: #APP
+; SKX-NEXT: bsfw %di, %ax
+; SKX-NEXT: bsfw (%rsi), %cx
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; SKX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_bsf16:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: bsfw %di, %ax
+; BTVER2-NEXT: bsfw (%rsi), %cx
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; BTVER2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_bsf16:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: bsfw %di, %ax
+; ZNVER1-NEXT: bsfw (%rsi), %cx
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call { i16, i16 } asm sideeffect "bsf $2, $0 \0A\09 bsf $3, $1", "=r,=r,r,*m,~{dirflag},~{fpsr},~{flags}"(i16 %a0, i16* %a1)
+ %2 = extractvalue { i16, i16 } %1, 0
+ %3 = extractvalue { i16, i16 } %1, 1
+ %4 = or i16 %2, %3
+ ret i16 %4
+}
+define i32 @test_bsf32(i32 %a0, i32* %a1) optsize {
+; GENERIC-LABEL: test_bsf32:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: bsfl %edi, %eax
+; GENERIC-NEXT: bsfl (%rsi), %ecx
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_bsf32:
+; ATOM: # BB#0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: bsfl %edi, %eax
+; ATOM-NEXT: bsfl (%rsi), %ecx
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_bsf32:
+; SLM: # BB#0:
+; SLM-NEXT: #APP
+; SLM-NEXT: bsfl %edi, %eax
+; SLM-NEXT: bsfl (%rsi), %ecx
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_bsf32:
+; SANDY: # BB#0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: bsfl %edi, %eax
+; SANDY-NEXT: bsfl (%rsi), %ecx
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_bsf32:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: bsfl %edi, %eax
+; HASWELL-NEXT: bsfl (%rsi), %ecx
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [2:1.00]
+;
+; BROADWELL-LABEL: test_bsf32:
+; BROADWELL: # BB#0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: bsfl %edi, %eax
+; BROADWELL-NEXT: bsfl (%rsi), %ecx
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_bsf32:
+; SKYLAKE: # BB#0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: bsfl %edi, %eax
+; SKYLAKE-NEXT: bsfl (%rsi), %ecx
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_bsf32:
+; SKX: # BB#0:
+; SKX-NEXT: #APP
+; SKX-NEXT: bsfl %edi, %eax
+; SKX-NEXT: bsfl (%rsi), %ecx
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_bsf32:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: bsfl %edi, %eax
+; BTVER2-NEXT: bsfl (%rsi), %ecx
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_bsf32:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: bsfl %edi, %eax
+; ZNVER1-NEXT: bsfl (%rsi), %ecx
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call { i32, i32 } asm sideeffect "bsf $2, $0 \0A\09 bsf $3, $1", "=r,=r,r,*m,~{dirflag},~{fpsr},~{flags}"(i32 %a0, i32* %a1)
+ %2 = extractvalue { i32, i32 } %1, 0
+ %3 = extractvalue { i32, i32 } %1, 1
+ %4 = or i32 %2, %3
+ ret i32 %4
+}
+define i64 @test_bsf64(i64 %a0, i64* %a1) optsize {
+; GENERIC-LABEL: test_bsf64:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: bsfq %rdi, %rax
+; GENERIC-NEXT: bsfq (%rsi), %rcx
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: orq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_bsf64:
+; ATOM: # BB#0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: bsfq %rdi, %rax
+; ATOM-NEXT: bsfq (%rsi), %rcx
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: orq %rcx, %rax # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_bsf64:
+; SLM: # BB#0:
+; SLM-NEXT: #APP
+; SLM-NEXT: bsfq %rdi, %rax
+; SLM-NEXT: bsfq (%rsi), %rcx
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: orq %rcx, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_bsf64:
+; SANDY: # BB#0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: bsfq %rdi, %rax
+; SANDY-NEXT: bsfq (%rsi), %rcx
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: orq %rcx, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_bsf64:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: bsfq %rdi, %rax
+; HASWELL-NEXT: bsfq (%rsi), %rcx
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [2:1.00]
+;
+; BROADWELL-LABEL: test_bsf64:
+; BROADWELL: # BB#0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: bsfq %rdi, %rax
+; BROADWELL-NEXT: bsfq (%rsi), %rcx
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_bsf64:
+; SKYLAKE: # BB#0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: bsfq %rdi, %rax
+; SKYLAKE-NEXT: bsfq (%rsi), %rcx
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_bsf64:
+; SKX: # BB#0:
+; SKX-NEXT: #APP
+; SKX-NEXT: bsfq %rdi, %rax
+; SKX-NEXT: bsfq (%rsi), %rcx
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_bsf64:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: bsfq %rdi, %rax
+; BTVER2-NEXT: bsfq (%rsi), %rcx
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: orq %rcx, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_bsf64:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: bsfq %rdi, %rax
+; ZNVER1-NEXT: bsfq (%rsi), %rcx
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call { i64, i64 } asm sideeffect "bsf $2, $0 \0A\09 bsf $3, $1", "=r,=r,r,*m,~{dirflag},~{fpsr},~{flags}"(i64 %a0, i64* %a1)
+ %2 = extractvalue { i64, i64 } %1, 0
+ %3 = extractvalue { i64, i64 } %1, 1
+ %4 = or i64 %2, %3
+ ret i64 %4
+}
+
+define i16 @test_bsr16(i16 %a0, i16* %a1) optsize {
+; GENERIC-LABEL: test_bsr16:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: bsrw %di, %ax
+; GENERIC-NEXT: bsrw (%rsi), %cx
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_bsr16:
+; ATOM: # BB#0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: bsrw %di, %ax
+; ATOM-NEXT: bsrw (%rsi), %cx
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; ATOM-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_bsr16:
+; SLM: # BB#0:
+; SLM-NEXT: #APP
+; SLM-NEXT: bsrw %di, %ax
+; SLM-NEXT: bsrw (%rsi), %cx
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; SLM-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_bsr16:
+; SANDY: # BB#0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: bsrw %di, %ax
+; SANDY-NEXT: bsrw (%rsi), %cx
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; SANDY-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_bsr16:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: bsrw %di, %ax
+; HASWELL-NEXT: bsrw (%rsi), %cx
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; HASWELL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; HASWELL-NEXT: retq # sched: [2:1.00]
+;
+; BROADWELL-LABEL: test_bsr16:
+; BROADWELL: # BB#0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: bsrw %di, %ax
+; BROADWELL-NEXT: bsrw (%rsi), %cx
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_bsr16:
+; SKYLAKE: # BB#0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: bsrw %di, %ax
+; SKYLAKE-NEXT: bsrw (%rsi), %cx
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_bsr16:
+; SKX: # BB#0:
+; SKX-NEXT: #APP
+; SKX-NEXT: bsrw %di, %ax
+; SKX-NEXT: bsrw (%rsi), %cx
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; SKX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_bsr16:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: bsrw %di, %ax
+; BTVER2-NEXT: bsrw (%rsi), %cx
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; BTVER2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_bsr16:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: bsrw %di, %ax
+; ZNVER1-NEXT: bsrw (%rsi), %cx
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call { i16, i16 } asm sideeffect "bsr $2, $0 \0A\09 bsr $3, $1", "=r,=r,r,*m,~{dirflag},~{fpsr},~{flags}"(i16 %a0, i16* %a1)
+ %2 = extractvalue { i16, i16 } %1, 0
+ %3 = extractvalue { i16, i16 } %1, 1
+ %4 = or i16 %2, %3
+ ret i16 %4
+}
+define i32 @test_bsr32(i32 %a0, i32* %a1) optsize {
+; GENERIC-LABEL: test_bsr32:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: bsrl %edi, %eax
+; GENERIC-NEXT: bsrl (%rsi), %ecx
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_bsr32:
+; ATOM: # BB#0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: bsrl %edi, %eax
+; ATOM-NEXT: bsrl (%rsi), %ecx
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_bsr32:
+; SLM: # BB#0:
+; SLM-NEXT: #APP
+; SLM-NEXT: bsrl %edi, %eax
+; SLM-NEXT: bsrl (%rsi), %ecx
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_bsr32:
+; SANDY: # BB#0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: bsrl %edi, %eax
+; SANDY-NEXT: bsrl (%rsi), %ecx
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_bsr32:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: bsrl %edi, %eax
+; HASWELL-NEXT: bsrl (%rsi), %ecx
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [2:1.00]
+;
+; BROADWELL-LABEL: test_bsr32:
+; BROADWELL: # BB#0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: bsrl %edi, %eax
+; BROADWELL-NEXT: bsrl (%rsi), %ecx
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_bsr32:
+; SKYLAKE: # BB#0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: bsrl %edi, %eax
+; SKYLAKE-NEXT: bsrl (%rsi), %ecx
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_bsr32:
+; SKX: # BB#0:
+; SKX-NEXT: #APP
+; SKX-NEXT: bsrl %edi, %eax
+; SKX-NEXT: bsrl (%rsi), %ecx
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_bsr32:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: bsrl %edi, %eax
+; BTVER2-NEXT: bsrl (%rsi), %ecx
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_bsr32:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: bsrl %edi, %eax
+; ZNVER1-NEXT: bsrl (%rsi), %ecx
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call { i32, i32 } asm sideeffect "bsr $2, $0 \0A\09 bsr $3, $1", "=r,=r,r,*m,~{dirflag},~{fpsr},~{flags}"(i32 %a0, i32* %a1)
+ %2 = extractvalue { i32, i32 } %1, 0
+ %3 = extractvalue { i32, i32 } %1, 1
+ %4 = or i32 %2, %3
+ ret i32 %4
+}
+define i64 @test_bsr64(i64 %a0, i64* %a1) optsize {
+; GENERIC-LABEL: test_bsr64:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: bsrq %rdi, %rax
+; GENERIC-NEXT: bsrq (%rsi), %rcx
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: orq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_bsr64:
+; ATOM: # BB#0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: bsrq %rdi, %rax
+; ATOM-NEXT: bsrq (%rsi), %rcx
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: orq %rcx, %rax # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_bsr64:
+; SLM: # BB#0:
+; SLM-NEXT: #APP
+; SLM-NEXT: bsrq %rdi, %rax
+; SLM-NEXT: bsrq (%rsi), %rcx
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: orq %rcx, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_bsr64:
+; SANDY: # BB#0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: bsrq %rdi, %rax
+; SANDY-NEXT: bsrq (%rsi), %rcx
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: orq %rcx, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_bsr64:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: bsrq %rdi, %rax
+; HASWELL-NEXT: bsrq (%rsi), %rcx
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [2:1.00]
+;
+; BROADWELL-LABEL: test_bsr64:
+; BROADWELL: # BB#0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: bsrq %rdi, %rax
+; BROADWELL-NEXT: bsrq (%rsi), %rcx
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_bsr64:
+; SKYLAKE: # BB#0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: bsrq %rdi, %rax
+; SKYLAKE-NEXT: bsrq (%rsi), %rcx
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_bsr64:
+; SKX: # BB#0:
+; SKX-NEXT: #APP
+; SKX-NEXT: bsrq %rdi, %rax
+; SKX-NEXT: bsrq (%rsi), %rcx
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_bsr64:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: bsrq %rdi, %rax
+; BTVER2-NEXT: bsrq (%rsi), %rcx
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: orq %rcx, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_bsr64:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: bsrq %rdi, %rax
+; ZNVER1-NEXT: bsrq (%rsi), %rcx
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call { i64, i64 } asm sideeffect "bsr $2, $0 \0A\09 bsr $3, $1", "=r,=r,r,*m,~{dirflag},~{fpsr},~{flags}"(i64 %a0, i64* %a1)
+ %2 = extractvalue { i64, i64 } %1, 0
+ %3 = extractvalue { i64, i64 } %1, 1
+ %4 = or i64 %2, %3
+ ret i64 %4
+}
+
+define i32 @test_bswap32(i32 %a0) optsize {
+; GENERIC-LABEL: test_bswap32:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: bswapl %edi # sched: [2:1.00]
+; GENERIC-NEXT: movl %edi, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_bswap32:
+; ATOM: # BB#0:
+; ATOM-NEXT: bswapl %edi # sched: [1:1.00]
+; ATOM-NEXT: movl %edi, %eax # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_bswap32:
+; SLM: # BB#0:
+; SLM-NEXT: bswapl %edi # sched: [1:0.50]
+; SLM-NEXT: movl %edi, %eax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_bswap32:
+; SANDY: # BB#0:
+; SANDY-NEXT: bswapl %edi # sched: [2:1.00]
+; SANDY-NEXT: movl %edi, %eax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_bswap32:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: bswapl %edi # sched: [2:0.50]
+; HASWELL-NEXT: movl %edi, %eax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [2:1.00]
+;
+; BROADWELL-LABEL: test_bswap32:
+; BROADWELL: # BB#0:
+; BROADWELL-NEXT: bswapl %edi # sched: [2:0.50]
+; BROADWELL-NEXT: movl %edi, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_bswap32:
+; SKYLAKE: # BB#0:
+; SKYLAKE-NEXT: bswapl %edi # sched: [2:0.50]
+; SKYLAKE-NEXT: movl %edi, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_bswap32:
+; SKX: # BB#0:
+; SKX-NEXT: bswapl %edi # sched: [2:0.50]
+; SKX-NEXT: movl %edi, %eax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_bswap32:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: bswapl %edi # sched: [1:0.50]
+; BTVER2-NEXT: movl %edi, %eax # sched: [1:0.17]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_bswap32:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: bswapl %edi # sched: [1:1.00]
+; ZNVER1-NEXT: movl %edi, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = tail call i32 asm "bswap $0", "=r,0"(i32 %a0) nounwind
+ ret i32 %1
+}
+define i64 @test_bswap64(i64 %a0) optsize {
+; GENERIC-LABEL: test_bswap64:
+; GENERIC: # BB#0:
+; GENERIC-NEXT: bswapq %rdi # sched: [2:1.00]
+; GENERIC-NEXT: movq %rdi, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_bswap64:
+; ATOM: # BB#0:
+; ATOM-NEXT: bswapq %rdi # sched: [1:1.00]
+; ATOM-NEXT: movq %rdi, %rax # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_bswap64:
+; SLM: # BB#0:
+; SLM-NEXT: bswapq %rdi # sched: [1:0.50]
+; SLM-NEXT: movq %rdi, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_bswap64:
+; SANDY: # BB#0:
+; SANDY-NEXT: bswapq %rdi # sched: [2:1.00]
+; SANDY-NEXT: movq %rdi, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_bswap64:
+; HASWELL: # BB#0:
+; HASWELL-NEXT: bswapq %rdi # sched: [2:0.50]
+; HASWELL-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [2:1.00]
+;
+; BROADWELL-LABEL: test_bswap64:
+; BROADWELL: # BB#0:
+; BROADWELL-NEXT: bswapq %rdi # sched: [2:0.50]
+; BROADWELL-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_bswap64:
+; SKYLAKE: # BB#0:
+; SKYLAKE-NEXT: bswapq %rdi # sched: [2:0.50]
+; SKYLAKE-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_bswap64:
+; SKX: # BB#0:
+; SKX-NEXT: bswapq %rdi # sched: [2:0.50]
+; SKX-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_bswap64:
+; BTVER2: # BB#0:
+; BTVER2-NEXT: bswapq %rdi # sched: [1:0.50]
+; BTVER2-NEXT: movq %rdi, %rax # sched: [1:0.17]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_bswap64:
+; ZNVER1: # BB#0:
+; ZNVER1-NEXT: bswapq %rdi # sched: [1:1.00]
+; ZNVER1-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = tail call i64 asm "bswap $0", "=r,0"(i64 %a0) nounwind
+ ret i64 %1
+}
diff --git a/test/CodeGen/X86/select-mmx.ll b/test/CodeGen/X86/select-mmx.ll
index 795990e3c32..7ad8b6f1b9c 100644
--- a/test/CodeGen/X86/select-mmx.ll
+++ b/test/CodeGen/X86/select-mmx.ll
@@ -48,6 +48,7 @@ define i64 @test47(i64 %arg) {
; I32-NEXT: movl {{[0-9]+}}(%esp), %edx
; I32-NEXT: movl %ebp, %esp
; I32-NEXT: popl %ebp
+; I32-NEXT: .cfi_def_cfa %esp, 4
; I32-NEXT: retl
%cond = icmp eq i64 %arg, 0
%slct = select i1 %cond, x86_mmx bitcast (i64 7 to x86_mmx), x86_mmx bitcast (i64 0 to x86_mmx)
@@ -100,6 +101,7 @@ define i64 @test49(i64 %arg, i64 %x, i64 %y) {
; I32-NEXT: movl {{[0-9]+}}(%esp), %edx
; I32-NEXT: movl %ebp, %esp
; I32-NEXT: popl %ebp
+; I32-NEXT: .cfi_def_cfa %esp, 4
; I32-NEXT: retl
%cond = icmp eq i64 %arg, 0
%xmmx = bitcast i64 %x to x86_mmx
diff --git a/test/CodeGen/X86/select.ll b/test/CodeGen/X86/select.ll
index 52225397ef0..c3674639eab 100644
--- a/test/CodeGen/X86/select.ll
+++ b/test/CodeGen/X86/select.ll
@@ -15,7 +15,6 @@ define i32 @test1(%0* %p, %0* %q, i1 %r) nounwind {
; CHECK-NEXT: cmovneq %rdi, %rsi
; CHECK-NEXT: movl (%rsi), %eax
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test1:
; MCU: # BB#0:
@@ -45,7 +44,7 @@ define i32 @test2() nounwind {
; GENERIC-NEXT: callq _return_false
; GENERIC-NEXT: xorl %ecx, %ecx
; GENERIC-NEXT: testb $1, %al
-; GENERIC-NEXT: movl $-480, %eax
+; GENERIC-NEXT: movl $-480, %eax ## imm = 0xFE20
; GENERIC-NEXT: cmovnel %ecx, %eax
; GENERIC-NEXT: shll $3, %eax
; GENERIC-NEXT: cmpl $32768, %eax ## imm = 0x8000
@@ -55,14 +54,13 @@ define i32 @test2() nounwind {
; GENERIC-NEXT: popq %rcx
; GENERIC-NEXT: retq
; GENERIC-NEXT: LBB1_1: ## %bb90
-; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test2:
; ATOM: ## BB#0: ## %entry
; ATOM-NEXT: pushq %rax
; ATOM-NEXT: callq _return_false
; ATOM-NEXT: xorl %ecx, %ecx
-; ATOM-NEXT: movl $-480, %edx
+; ATOM-NEXT: movl $-480, %edx ## imm = 0xFE20
; ATOM-NEXT: testb $1, %al
; ATOM-NEXT: cmovnel %ecx, %edx
; ATOM-NEXT: shll $3, %edx
@@ -73,17 +71,16 @@ define i32 @test2() nounwind {
; ATOM-NEXT: popq %rcx
; ATOM-NEXT: retq
; ATOM-NEXT: LBB1_1: ## %bb90
-; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test2:
; MCU: # BB#0: # %entry
; MCU-NEXT: calll return_false
-; MCU-NEXT: xorl %ecx, %ecx
+; MCU-NEXT: xorl %ecx, %ecx
; MCU-NEXT: testb $1, %al
; MCU-NEXT: jne .LBB1_2
; MCU-NEXT: # BB#1: # %entry
; MCU-NEXT: movl $-480, %ecx # imm = 0xFE20
-; MCU-NEXT: .LBB1_2:
+; MCU-NEXT: .LBB1_2: # %entry
; MCU-NEXT: shll $3, %ecx
; MCU-NEXT: cmpl $32768, %ecx # imm = 0x8000
; MCU-NEXT: jge .LBB1_3
@@ -116,7 +113,6 @@ define float @test3(i32 %x) nounwind readnone {
; CHECK-NEXT: leaq {{.*}}(%rip), %rcx
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test3:
; MCU: # BB#0: # %entry
@@ -140,7 +136,6 @@ define signext i8 @test4(i8* nocapture %P, double %F) nounwind readonly {
; CHECK-NEXT: seta %al
; CHECK-NEXT: movsbl (%rdi,%rax,4), %eax
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test4:
; MCU: # BB#0: # %entry
@@ -175,7 +170,6 @@ define void @test5(i1 %c, <2 x i16> %a, <2 x i16> %b, <2 x i16>* %p) nounwind {
; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; CHECK-NEXT: movd %xmm0, (%rsi)
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test5:
; MCU: # BB#0:
@@ -211,7 +205,6 @@ define void @test6(i32 %C, <4 x float>* %A, <4 x float>* %B) nounwind {
; CHECK-NEXT: mulps %xmm0, %xmm0
; CHECK-NEXT: movaps %xmm0, (%rsi)
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test6:
; MCU: # BB#0:
@@ -283,7 +276,6 @@ define x86_fp80 @test7(i32 %tmp8) nounwind {
; CHECK-NEXT: leaq {{.*}}(%rip), %rcx
; CHECK-NEXT: fldt (%rax,%rcx)
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test7:
; MCU: # BB#0:
@@ -333,7 +325,6 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2)
; GENERIC-NEXT: movq %xmm1, 16(%rsi)
; GENERIC-NEXT: movdqa %xmm0, (%rsi)
; GENERIC-NEXT: retq
-; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test8:
; ATOM: ## BB#0:
@@ -366,7 +357,6 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2)
; ATOM-NEXT: movdqa %xmm0, (%rsi)
; ATOM-NEXT: movq %xmm1, 16(%rsi)
; ATOM-NEXT: retq
-; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test8:
; MCU: # BB#0:
@@ -456,7 +446,6 @@ define i64 @test9(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; GENERIC-NEXT: sbbq %rax, %rax
; GENERIC-NEXT: orq %rsi, %rax
; GENERIC-NEXT: retq
-; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test9:
; ATOM: ## BB#0:
@@ -466,7 +455,6 @@ define i64 @test9(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; ATOM-NEXT: nop
; ATOM-NEXT: nop
; ATOM-NEXT: retq
-; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test9:
; MCU: # BB#0:
@@ -493,7 +481,6 @@ define i64 @test9a(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; GENERIC-NEXT: sbbq %rax, %rax
; GENERIC-NEXT: orq %rsi, %rax
; GENERIC-NEXT: retq
-; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test9a:
; ATOM: ## BB#0:
@@ -503,7 +490,6 @@ define i64 @test9a(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; ATOM-NEXT: nop
; ATOM-NEXT: nop
; ATOM-NEXT: retq
-; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test9a:
; MCU: # BB#0:
@@ -528,7 +514,6 @@ define i64 @test9b(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; GENERIC-NEXT: sbbq %rax, %rax
; GENERIC-NEXT: orq %rsi, %rax
; GENERIC-NEXT: retq
-; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test9b:
; ATOM: ## BB#0:
@@ -538,7 +523,6 @@ define i64 @test9b(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; ATOM-NEXT: nop
; ATOM-NEXT: nop
; ATOM-NEXT: retq
-; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test9b:
; MCU: # BB#0:
@@ -566,7 +550,6 @@ define i64 @test10(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; CHECK-NEXT: setne %al
; CHECK-NEXT: leaq -1(%rax,%rax), %rax
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test10:
; MCU: # BB#0:
@@ -592,7 +575,6 @@ define i64 @test11(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; CHECK-NEXT: notq %rax
; CHECK-NEXT: orq %rsi, %rax
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test11:
; MCU: # BB#0:
@@ -619,7 +601,6 @@ define i64 @test11a(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; CHECK-NEXT: notq %rax
; CHECK-NEXT: orq %rsi, %rax
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test11a:
; MCU: # BB#0:
@@ -649,7 +630,6 @@ define noalias i8* @test12(i64 %count) nounwind ssp noredzone {
; GENERIC-NEXT: movq $-1, %rdi
; GENERIC-NEXT: cmovnoq %rax, %rdi
; GENERIC-NEXT: jmp __Znam ## TAILCALL
-; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test12:
; ATOM: ## BB#0: ## %entry
@@ -659,7 +639,6 @@ define noalias i8* @test12(i64 %count) nounwind ssp noredzone {
; ATOM-NEXT: movq $-1, %rdi
; ATOM-NEXT: cmovnoq %rax, %rdi
; ATOM-NEXT: jmp __Znam ## TAILCALL
-; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test12:
; MCU: # BB#0: # %entry
@@ -710,7 +689,6 @@ define i32 @test13(i32 %a, i32 %b) nounwind {
; GENERIC-NEXT: cmpl %esi, %edi
; GENERIC-NEXT: sbbl %eax, %eax
; GENERIC-NEXT: retq
-; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test13:
; ATOM: ## BB#0:
@@ -721,7 +699,6 @@ define i32 @test13(i32 %a, i32 %b) nounwind {
; ATOM-NEXT: nop
; ATOM-NEXT: nop
; ATOM-NEXT: retq
-; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test13:
; MCU: # BB#0:
@@ -741,7 +718,6 @@ define i32 @test14(i32 %a, i32 %b) nounwind {
; CHECK-NEXT: setae %al
; CHECK-NEXT: negl %eax
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test14:
; MCU: # BB#0:
@@ -763,7 +739,6 @@ define i32 @test15(i32 %x) nounwind {
; GENERIC-NEXT: negl %edi
; GENERIC-NEXT: sbbl %eax, %eax
; GENERIC-NEXT: retq
-; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test15:
; ATOM: ## BB#0: ## %entry
@@ -774,7 +749,6 @@ define i32 @test15(i32 %x) nounwind {
; ATOM-NEXT: nop
; ATOM-NEXT: nop
; ATOM-NEXT: retq
-; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test15:
; MCU: # BB#0: # %entry
@@ -826,7 +800,6 @@ define i16 @test17(i16 %x) nounwind {
; GENERIC-NEXT: sbbl %eax, %eax
; GENERIC-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; GENERIC-NEXT: retq
-; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test17:
; ATOM: ## BB#0: ## %entry
@@ -838,7 +811,6 @@ define i16 @test17(i16 %x) nounwind {
; ATOM-NEXT: nop
; ATOM-NEXT: nop
; ATOM-NEXT: retq
-; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test17:
; MCU: # BB#0: # %entry
@@ -859,7 +831,6 @@ define i8 @test18(i32 %x, i8 zeroext %a, i8 zeroext %b) nounwind {
; GENERIC-NEXT: cmovgel %edx, %esi
; GENERIC-NEXT: movl %esi, %eax
; GENERIC-NEXT: retq
-; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test18:
; ATOM: ## BB#0:
@@ -869,7 +840,6 @@ define i8 @test18(i32 %x, i8 zeroext %a, i8 zeroext %b) nounwind {
; ATOM-NEXT: nop
; ATOM-NEXT: nop
; ATOM-NEXT: retq
-; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test18:
; MCU: # BB#0:
diff --git a/test/CodeGen/X86/setcc-lowering.ll b/test/CodeGen/X86/setcc-lowering.ll
index 20c77a4a517..5ae2cc5f35c 100644
--- a/test/CodeGen/X86/setcc-lowering.ll
+++ b/test/CodeGen/X86/setcc-lowering.ll
@@ -23,10 +23,9 @@ define <8 x i16> @pr25080(<8 x i32> %a) {
;
; KNL-32-LABEL: pr25080:
; KNL-32: # BB#0: # %entry
-; KNL-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [8388607,8388607,8388607,8388607,8388607,8388607,8388607,8388607]
-; KNL-32-NEXT: vpand %ymm1, %ymm0, %ymm0
-; KNL-32-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; KNL-32-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; KNL-32-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL-32-NEXT: vbroadcastss {{.*#+}} ymm1 = [8388607,8388607,8388607,8388607,8388607,8388607,8388607,8388607]
+; KNL-32-NEXT: vptestnmd %zmm1, %zmm0, %k0
; KNL-32-NEXT: movb $15, %al
; KNL-32-NEXT: kmovw %eax, %k1
; KNL-32-NEXT: korw %k1, %k0, %k1
@@ -90,6 +89,7 @@ define void @pr26232(i64 %a, <16 x i1> %b) {
; KNL-32-NEXT: jne .LBB1_1
; KNL-32-NEXT: # BB#2: # %for_exit600
; KNL-32-NEXT: popl %esi
+; KNL-32-NEXT: .cfi_def_cfa_offset 4
; KNL-32-NEXT: retl
allocas:
br label %for_test11.preheader
diff --git a/test/CodeGen/X86/shrink_vmul.ll b/test/CodeGen/X86/shrink_vmul.ll
index 79cf0f2c8f1..a2767205fe2 100644
--- a/test/CodeGen/X86/shrink_vmul.ll
+++ b/test/CodeGen/X86/shrink_vmul.ll
@@ -31,6 +31,7 @@ define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT: movq %xmm1, (%esi,%ecx,4)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: mul_2xi8:
@@ -89,6 +90,7 @@ define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT: movdqu %xmm1, (%esi,%ecx,4)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: mul_4xi8:
@@ -148,6 +150,7 @@ define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
; X86-NEXT: movdqu %xmm1, 16(%esi,%ecx,4)
; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: mul_8xi8:
@@ -220,6 +223,7 @@ define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
; X86-NEXT: movdqu %xmm4, 16(%esi,%ecx,4)
; X86-NEXT: movdqu %xmm3, (%esi,%ecx,4)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: mul_16xi8:
@@ -288,6 +292,7 @@ define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT: movq %xmm1, (%esi,%ecx,4)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: mul_2xi16:
@@ -342,6 +347,7 @@ define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT: movdqu %xmm1, (%esi,%ecx,4)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: mul_4xi16:
@@ -399,6 +405,7 @@ define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
; X86-NEXT: movdqu %xmm1, 16(%esi,%ecx,4)
; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: mul_8xi16:
@@ -469,6 +476,7 @@ define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i6
; X86-NEXT: movdqu %xmm2, 16(%esi,%ecx,4)
; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: mul_16xi16:
@@ -541,6 +549,7 @@ define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b,
; X86-NEXT: psrad $16, %xmm0
; X86-NEXT: movq %xmm0, (%esi,%ecx,4)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: mul_2xi8_sext:
@@ -606,6 +615,7 @@ define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonl
; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-NEXT: movq %xmm0, (%esi,%ecx,4)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: mul_2xi8_sext_zext:
@@ -666,6 +676,7 @@ define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b
; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT: movq %xmm1, (%esi,%ecx,4)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: mul_2xi16_sext:
@@ -733,6 +744,7 @@ define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readon
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; X86-NEXT: movq %xmm0, (%esi,%ecx,4)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: mul_2xi16_sext_zext:
@@ -813,6 +825,7 @@ define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %
; X86-NEXT: movdqu %xmm2, 16(%esi,%ecx,4)
; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4)
; X86-NEXT: popl %esi
+; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: mul_16xi16_sext:
diff --git a/test/CodeGen/X86/sse-intrinsics-x86.ll b/test/CodeGen/X86/sse-intrinsics-x86.ll
index f178e18a259..ca74ee5732d 100644
--- a/test/CodeGen/X86/sse-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse-intrinsics-x86.ll
@@ -401,15 +401,10 @@ define <4 x float> @test_x86_sse_rcp_ps(<4 x float> %a0) {
; SSE-NEXT: rcpps %xmm0, %xmm0 ## encoding: [0x0f,0x53,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
-; AVX2-LABEL: test_x86_sse_rcp_ps:
-; AVX2: ## BB#0:
-; AVX2-NEXT: vrcpps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x53,0xc0]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse_rcp_ps:
-; SKX: ## BB#0:
-; SKX-NEXT: vrcp14ps %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x4c,0xc0]
-; SKX-NEXT: retl ## encoding: [0xc3]
+; VCHECK-LABEL: test_x86_sse_rcp_ps:
+; VCHECK: ## BB#0:
+; VCHECK-NEXT: vrcpps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x53,0xc0]
+; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -438,15 +433,10 @@ define <4 x float> @test_x86_sse_rsqrt_ps(<4 x float> %a0) {
; SSE-NEXT: rsqrtps %xmm0, %xmm0 ## encoding: [0x0f,0x52,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
-; AVX2-LABEL: test_x86_sse_rsqrt_ps:
-; AVX2: ## BB#0:
-; AVX2-NEXT: vrsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x52,0xc0]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse_rsqrt_ps:
-; SKX: ## BB#0:
-; SKX-NEXT: vrsqrt14ps %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x4e,0xc0]
-; SKX-NEXT: retl ## encoding: [0xc3]
+; VCHECK-LABEL: test_x86_sse_rsqrt_ps:
+; VCHECK: ## BB#0:
+; VCHECK-NEXT: vrsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x52,0xc0]
+; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -475,10 +465,15 @@ define <4 x float> @test_x86_sse_sqrt_ps(<4 x float> %a0) {
; SSE-NEXT: sqrtps %xmm0, %xmm0 ## encoding: [0x0f,0x51,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
-; VCHECK-LABEL: test_x86_sse_sqrt_ps:
-; VCHECK: ## BB#0:
-; VCHECK-NEXT: vsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x51,0xc0]
-; VCHECK-NEXT: retl ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_sse_sqrt_ps:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x51,0xc0]
+; AVX2-NEXT: retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse_sqrt_ps:
+; SKX: ## BB#0:
+; SKX-NEXT: vsqrtps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x51,0xc0]
+; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -491,10 +486,15 @@ define <4 x float> @test_x86_sse_sqrt_ss(<4 x float> %a0) {
; SSE-NEXT: sqrtss %xmm0, %xmm0 ## encoding: [0xf3,0x0f,0x51,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
-; VCHECK-LABEL: test_x86_sse_sqrt_ss:
-; VCHECK: ## BB#0:
-; VCHECK-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x51,0xc0]
-; VCHECK-NEXT: retl ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_sse_sqrt_ss:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x51,0xc0]
+; AVX2-NEXT: retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse_sqrt_ss:
+; SKX: ## BB#0:
+; SKX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x51,0xc0]
+; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
diff --git a/test/CodeGen/X86/sse-schedule.ll b/test/CodeGen/X86/sse-schedule.ll
index b5c2bff4b8f..d3c995197e8 100644
--- a/test/CodeGen/X86/sse-schedule.ll
+++ b/test/CodeGen/X86/sse-schedule.ll
@@ -2547,8 +2547,8 @@ define <4 x float> @test_rcpps(<4 x float> %a0, <4 x float> *%a1) {
;
; SKX-LABEL: test_rcpps:
; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %xmm0, %xmm0 # sched: [4:1.00]
-; SKX-NEXT: vrcp14ps (%rdi), %xmm1 # sched: [10:1.00]
+; SKX-NEXT: vrcpps %xmm0, %xmm0 # sched: [4:1.00]
+; SKX-NEXT: vrcpps (%rdi), %xmm1 # sched: [10:1.00]
; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
;
@@ -2719,8 +2719,8 @@ define <4 x float> @test_rsqrtps(<4 x float> %a0, <4 x float> *%a1) {
;
; SKX-LABEL: test_rsqrtps:
; SKX: # BB#0:
-; SKX-NEXT: vrsqrt14ps %xmm0, %xmm0 # sched: [4:1.00]
-; SKX-NEXT: vrsqrt14ps (%rdi), %xmm1 # sched: [10:1.00]
+; SKX-NEXT: vrsqrtps %xmm0, %xmm0 # sched: [4:1.00]
+; SKX-NEXT: vrsqrtps (%rdi), %xmm1 # sched: [10:1.00]
; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
;
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll
index d4047faad9b..72c68c56638 100644
--- a/test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -1592,10 +1592,15 @@ define <2 x double> @test_x86_sse2_sqrt_pd(<2 x double> %a0) {
; SSE-NEXT: sqrtpd %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x51,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
-; VCHECK-LABEL: test_x86_sse2_sqrt_pd:
-; VCHECK: ## BB#0:
-; VCHECK-NEXT: vsqrtpd %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x51,0xc0]
-; VCHECK-NEXT: retl ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_sse2_sqrt_pd:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vsqrtpd %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x51,0xc0]
+; AVX2-NEXT: retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_sqrt_pd:
+; SKX: ## BB#0:
+; SKX-NEXT: vsqrtpd %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x51,0xc0]
+; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -1608,10 +1613,15 @@ define <2 x double> @test_x86_sse2_sqrt_sd(<2 x double> %a0) {
; SSE-NEXT: sqrtsd %xmm0, %xmm0 ## encoding: [0xf2,0x0f,0x51,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
-; VCHECK-LABEL: test_x86_sse2_sqrt_sd:
-; VCHECK: ## BB#0:
-; VCHECK-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x51,0xc0]
-; VCHECK-NEXT: retl ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_sse2_sqrt_sd:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x51,0xc0]
+; AVX2-NEXT: retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_sqrt_sd:
+; SKX: ## BB#0:
+; SKX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x51,0xc0]
+; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a0) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -1637,7 +1647,7 @@ define <2 x double> @test_x86_sse2_sqrt_sd_vec_load(<2 x double>* %a0) {
; SKX: ## BB#0:
; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; SKX-NEXT: vmovapd (%eax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0x00]
-; SKX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x51,0xc0]
+; SKX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x51,0xc0]
; SKX-NEXT: retl ## encoding: [0xc3]
%a1 = load <2 x double>, <2 x double>* %a0, align 16
%res = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a1) ; <<2 x double>> [#uses=1]
diff --git a/test/CodeGen/X86/statepoint-call-lowering.ll b/test/CodeGen/X86/statepoint-call-lowering.ll
index bd2dd53b654..d80c87b99b6 100644
--- a/test/CodeGen/X86/statepoint-call-lowering.ll
+++ b/test/CodeGen/X86/statepoint-call-lowering.ll
@@ -83,6 +83,7 @@ define i1 @test_relocate(i32 addrspace(1)* %a) gc "statepoint-example" {
; CHECK: callq return_i1
; CHECK-NEXT: .Ltmp5:
; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
entry:
%safepoint_token = tail call token (i64, i32, i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i64 0, i32 0, i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 0, i32 addrspace(1)* %a)
diff --git a/test/CodeGen/X86/statepoint-gctransition-call-lowering.ll b/test/CodeGen/X86/statepoint-gctransition-call-lowering.ll
index b88ca03805f..90f2002e2d4 100644
--- a/test/CodeGen/X86/statepoint-gctransition-call-lowering.ll
+++ b/test/CodeGen/X86/statepoint-gctransition-call-lowering.ll
@@ -69,6 +69,7 @@ define i1 @test_relocate(i32 addrspace(1)* %a) gc "statepoint-example" {
; CHECK: callq return_i1
; CHECK-NEXT: .Ltmp4:
; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
entry:
%safepoint_token = tail call token (i64, i32, i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i64 0, i32 0, i1 ()* @return_i1, i32 0, i32 1, i32 0, i32 0, i32 addrspace(1)* %a)
diff --git a/test/CodeGen/X86/statepoint-invoke.ll b/test/CodeGen/X86/statepoint-invoke.ll
index 784b932addc..5aa902546c1 100644
--- a/test/CodeGen/X86/statepoint-invoke.ll
+++ b/test/CodeGen/X86/statepoint-invoke.ll
@@ -142,6 +142,7 @@ normal_return:
; CHECK-LABEL: %normal_return
; CHECK: xorl %eax, %eax
; CHECK-NEXT: popq
+ ; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%null.relocated = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %sp1, i32 13, i32 13)
%undef.relocated = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %sp1, i32 14, i32 14)
@@ -169,6 +170,7 @@ entry:
normal_return:
; CHECK: leaq
; CHECK-NEXT: popq
+ ; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%aa.rel = call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %sp, i32 13, i32 13)
%aa.converted = bitcast i32 addrspace(1)* %aa.rel to i64 addrspace(1)*
@@ -177,6 +179,7 @@ normal_return:
exceptional_return:
; CHECK: movl $15
; CHECK-NEXT: popq
+ ; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
%landing_pad = landingpad token
cleanup
diff --git a/test/CodeGen/X86/throws-cfi-fp.ll b/test/CodeGen/X86/throws-cfi-fp.ll
new file mode 100644
index 00000000000..bacd965054c
--- /dev/null
+++ b/test/CodeGen/X86/throws-cfi-fp.ll
@@ -0,0 +1,98 @@
+; RUN: llc %s -o - | FileCheck %s
+
+; ModuleID = 'throws-cfi-fp.cpp'
+source_filename = "throws-cfi-fp.cpp"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+$__clang_call_terminate = comdat any
+
+@_ZL11ShouldThrow = internal unnamed_addr global i1 false, align 1
+@_ZTIi = external constant i8*
+@str = private unnamed_addr constant [20 x i8] c"Threw an exception!\00"
+
+; Function Attrs: uwtable
+define void @_Z6throwsv() #0 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+
+; CHECK-LABEL: _Z6throwsv:
+; CHECK: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa %rsp, 8
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB0_1:
+; CHECK-NEXT: .cfi_def_cfa %rbp, 16
+
+entry:
+ %.b5 = load i1, i1* @_ZL11ShouldThrow, align 1
+ br i1 %.b5, label %if.then, label %try.cont
+
+if.then: ; preds = %entry
+ %exception = tail call i8* @__cxa_allocate_exception(i64 4)
+ %0 = bitcast i8* %exception to i32*
+ store i32 1, i32* %0, align 16
+ invoke void @__cxa_throw(i8* %exception, i8* bitcast (i8** @_ZTIi to i8*), i8* null)
+ to label %unreachable unwind label %lpad
+
+lpad: ; preds = %if.then
+ %1 = landingpad { i8*, i32 }
+ catch i8* null
+ %2 = extractvalue { i8*, i32 } %1, 0
+ %3 = tail call i8* @__cxa_begin_catch(i8* %2)
+ %puts = tail call i32 @puts(i8* getelementptr inbounds ([20 x i8], [20 x i8]* @str, i64 0, i64 0))
+ invoke void @__cxa_rethrow()
+ to label %unreachable unwind label %lpad1
+
+lpad1: ; preds = %lpad
+ %4 = landingpad { i8*, i32 }
+ cleanup
+ invoke void @__cxa_end_catch()
+ to label %eh.resume unwind label %terminate.lpad
+
+try.cont: ; preds = %entry
+ ret void
+
+eh.resume: ; preds = %lpad1
+ resume { i8*, i32 } %4
+
+terminate.lpad: ; preds = %lpad1
+ %5 = landingpad { i8*, i32 }
+ catch i8* null
+ %6 = extractvalue { i8*, i32 } %5, 0
+ tail call void @__clang_call_terminate(i8* %6)
+ unreachable
+
+unreachable: ; preds = %lpad, %if.then
+ unreachable
+}
+
+declare i8* @__cxa_allocate_exception(i64)
+
+declare void @__cxa_throw(i8*, i8*, i8*)
+
+declare i32 @__gxx_personality_v0(...)
+
+declare i8* @__cxa_begin_catch(i8*)
+
+declare void @__cxa_rethrow()
+
+declare void @__cxa_end_catch()
+
+; Function Attrs: noinline noreturn nounwind
+declare void @__clang_call_terminate(i8*)
+
+declare void @_ZSt9terminatev()
+
+; Function Attrs: nounwind
+declare i32 @puts(i8* nocapture readonly)
+
+attributes #0 = { "no-frame-pointer-elim"="true" }
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!8, !9, !10}
+
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !3, producer: "clang version 6.0.0 (https://github.com/llvm-mirror/clang.git 316ebefb7fff8ad324a08a694347500b6cd7c95f) (https://github.com/llvm-mirror/llvm.git dcae9be81fc17cdfbe989402354d3c8ecd0a2c79)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5)
+!3 = !DIFile(filename: "throws-cfi-fp.cpp", directory: "epilogue-dwarf/test")
+!4 = !{}
+!5 = !{}
+!8 = !{i32 2, !"Dwarf Version", i32 4}
+!9 = !{i32 2, !"Debug Info Version", i32 3}
+!10 = !{i32 1, !"wchar_size", i32 4}
diff --git a/test/CodeGen/X86/throws-cfi-no-fp.ll b/test/CodeGen/X86/throws-cfi-no-fp.ll
new file mode 100644
index 00000000000..1483e6b8483
--- /dev/null
+++ b/test/CodeGen/X86/throws-cfi-no-fp.ll
@@ -0,0 +1,97 @@
+; RUN: llc %s -o - | FileCheck %s
+
+; ModuleID = 'throws-cfi-no-fp.cpp'
+source_filename = "throws-cfi-no-fp.cpp"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+$__clang_call_terminate = comdat any
+
+@_ZL11ShouldThrow = internal unnamed_addr global i1 false, align 1
+@_ZTIi = external constant i8*
+@str = private unnamed_addr constant [20 x i8] c"Threw an exception!\00"
+
+; Function Attrs: uwtable
+define void @_Z6throwsv() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+
+; CHECK-LABEL: _Z6throwsv:
+; CHECK: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB0_1:
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+
+entry:
+ %.b5 = load i1, i1* @_ZL11ShouldThrow, align 1
+ br i1 %.b5, label %if.then, label %try.cont
+
+if.then: ; preds = %entry
+ %exception = tail call i8* @__cxa_allocate_exception(i64 4)
+ %0 = bitcast i8* %exception to i32*
+ store i32 1, i32* %0, align 16
+ invoke void @__cxa_throw(i8* %exception, i8* bitcast (i8** @_ZTIi to i8*), i8* null)
+ to label %unreachable unwind label %lpad
+
+lpad: ; preds = %if.then
+ %1 = landingpad { i8*, i32 }
+ catch i8* null
+ %2 = extractvalue { i8*, i32 } %1, 0
+ %3 = tail call i8* @__cxa_begin_catch(i8* %2)
+ %puts = tail call i32 @puts(i8* getelementptr inbounds ([20 x i8], [20 x i8]* @str, i64 0, i64 0))
+ invoke void @__cxa_rethrow() #4
+ to label %unreachable unwind label %lpad1
+
+lpad1: ; preds = %lpad
+ %4 = landingpad { i8*, i32 }
+ cleanup
+ invoke void @__cxa_end_catch()
+ to label %eh.resume unwind label %terminate.lpad
+
+try.cont: ; preds = %entry
+ ret void
+
+eh.resume: ; preds = %lpad1
+ resume { i8*, i32 } %4
+
+terminate.lpad: ; preds = %lpad1
+ %5 = landingpad { i8*, i32 }
+ catch i8* null
+ %6 = extractvalue { i8*, i32 } %5, 0
+ tail call void @__clang_call_terminate(i8* %6)
+ unreachable
+
+unreachable: ; preds = %lpad, %if.then
+ unreachable
+}
+
+declare i8* @__cxa_allocate_exception(i64)
+
+declare void @__cxa_throw(i8*, i8*, i8*)
+
+declare i32 @__gxx_personality_v0(...)
+
+declare i8* @__cxa_begin_catch(i8*)
+
+declare void @__cxa_rethrow()
+
+declare void @__cxa_end_catch()
+
+; Function Attrs: noinline noreturn nounwind
+declare void @__clang_call_terminate(i8*)
+
+declare void @_ZSt9terminatev()
+
+
+; Function Attrs: nounwind
+declare i32 @puts(i8* nocapture readonly)
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!8, !9, !10}
+
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !3, producer: "clang version 6.0.0 (https://github.com/llvm-mirror/clang.git 316ebefb7fff8ad324a08a694347500b6cd7c95f) (https://github.com/llvm-mirror/llvm.git dcae9be81fc17cdfbe989402354d3c8ecd0a2c79)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5)
+!3 = !DIFile(filename: "throws-cfi-no-fp.cpp", directory: "epilogue-dwarf/test")
+!4 = !{}
+!5 = !{}
+!8 = !{i32 2, !"Dwarf Version", i32 4}
+!9 = !{i32 2, !"Debug Info Version", i32 3}
+!10 = !{i32 1, !"wchar_size", i32 4}
diff --git a/test/CodeGen/X86/var-permute-128.ll b/test/CodeGen/X86/var-permute-128.ll
index f74343d7f2a..208fab88b58 100644
--- a/test/CodeGen/X86/var-permute-128.ll
+++ b/test/CodeGen/X86/var-permute-128.ll
@@ -143,35 +143,40 @@ define <8 x i16> @var_shuffle_v8i16(<8 x i16> %v, <8 x i16> %indices) nounwind {
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-NEXT: retq
;
-; AVX-LABEL: var_shuffle_v8i16:
-; AVX: # BB#0:
-; AVX-NEXT: vmovd %xmm1, %eax
-; AVX-NEXT: vpextrw $1, %xmm1, %r10d
-; AVX-NEXT: vpextrw $2, %xmm1, %ecx
-; AVX-NEXT: vpextrw $3, %xmm1, %edx
-; AVX-NEXT: vpextrw $4, %xmm1, %esi
-; AVX-NEXT: vpextrw $5, %xmm1, %edi
-; AVX-NEXT: vpextrw $6, %xmm1, %r8d
-; AVX-NEXT: vpextrw $7, %xmm1, %r9d
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: andl $7, %eax
-; AVX-NEXT: andl $7, %r10d
-; AVX-NEXT: andl $7, %ecx
-; AVX-NEXT: andl $7, %edx
-; AVX-NEXT: andl $7, %esi
-; AVX-NEXT: andl $7, %edi
-; AVX-NEXT: andl $7, %r8d
-; AVX-NEXT: andl $7, %r9d
-; AVX-NEXT: movzwl -24(%rsp,%rax,2), %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vpinsrw $1, -24(%rsp,%r10,2), %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $2, -24(%rsp,%rcx,2), %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $3, -24(%rsp,%rdx,2), %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $4, -24(%rsp,%rsi,2), %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $5, -24(%rsp,%rdi,2), %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $6, -24(%rsp,%r8,2), %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $7, -24(%rsp,%r9,2), %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVXNOVLBW-LABEL: var_shuffle_v8i16:
+; AVXNOVLBW: # BB#0:
+; AVXNOVLBW-NEXT: vmovd %xmm1, %eax
+; AVXNOVLBW-NEXT: vpextrw $1, %xmm1, %r10d
+; AVXNOVLBW-NEXT: vpextrw $2, %xmm1, %ecx
+; AVXNOVLBW-NEXT: vpextrw $3, %xmm1, %edx
+; AVXNOVLBW-NEXT: vpextrw $4, %xmm1, %esi
+; AVXNOVLBW-NEXT: vpextrw $5, %xmm1, %edi
+; AVXNOVLBW-NEXT: vpextrw $6, %xmm1, %r8d
+; AVXNOVLBW-NEXT: vpextrw $7, %xmm1, %r9d
+; AVXNOVLBW-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVXNOVLBW-NEXT: andl $7, %eax
+; AVXNOVLBW-NEXT: andl $7, %r10d
+; AVXNOVLBW-NEXT: andl $7, %ecx
+; AVXNOVLBW-NEXT: andl $7, %edx
+; AVXNOVLBW-NEXT: andl $7, %esi
+; AVXNOVLBW-NEXT: andl $7, %edi
+; AVXNOVLBW-NEXT: andl $7, %r8d
+; AVXNOVLBW-NEXT: andl $7, %r9d
+; AVXNOVLBW-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVXNOVLBW-NEXT: vmovd %eax, %xmm0
+; AVXNOVLBW-NEXT: vpinsrw $1, -24(%rsp,%r10,2), %xmm0, %xmm0
+; AVXNOVLBW-NEXT: vpinsrw $2, -24(%rsp,%rcx,2), %xmm0, %xmm0
+; AVXNOVLBW-NEXT: vpinsrw $3, -24(%rsp,%rdx,2), %xmm0, %xmm0
+; AVXNOVLBW-NEXT: vpinsrw $4, -24(%rsp,%rsi,2), %xmm0, %xmm0
+; AVXNOVLBW-NEXT: vpinsrw $5, -24(%rsp,%rdi,2), %xmm0, %xmm0
+; AVXNOVLBW-NEXT: vpinsrw $6, -24(%rsp,%r8,2), %xmm0, %xmm0
+; AVXNOVLBW-NEXT: vpinsrw $7, -24(%rsp,%r9,2), %xmm0, %xmm0
+; AVXNOVLBW-NEXT: retq
+;
+; AVX512VLBW-LABEL: var_shuffle_v8i16:
+; AVX512VLBW: # BB#0:
+; AVX512VLBW-NEXT: vpermw %xmm0, %xmm1, %xmm0
+; AVX512VLBW-NEXT: retq
%index0 = extractelement <8 x i16> %indices, i32 0
%index1 = extractelement <8 x i16> %indices, i32 1
%index2 = extractelement <8 x i16> %indices, i32 2
@@ -202,143 +207,13 @@ define <8 x i16> @var_shuffle_v8i16(<8 x i16> %v, <8 x i16> %indices) nounwind {
define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind {
; SSSE3-LABEL: var_shuffle_v16i8:
; SSSE3: # BB#0:
-; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm8
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm15
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm9
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm3
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm10
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm7
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm11
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm6
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm12
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm5
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm13
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm4
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm14
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm1
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm2
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: andl $15, %ecx
-; SSSE3-NEXT: movzbl (%rcx,%rax), %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
+; SSSE3-NEXT: pshufb %xmm0, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: var_shuffle_v16i8:
; AVX: # BB#0:
-; AVX-NEXT: vpextrb $0, %xmm1, %eax
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movzbl (%rax,%rcx), %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vpextrb $1, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $1, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $2, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $2, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $3, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $3, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $4, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $4, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $5, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $5, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $6, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $6, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $7, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $7, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $8, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $8, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $9, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $9, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $10, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $10, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $11, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $11, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $12, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $12, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $13, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $13, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $14, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $14, (%rax,%rcx), %xmm0, %xmm0
-; AVX-NEXT: vpextrb $15, %xmm1, %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: vpinsrb $15, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%index0 = extractelement <16 x i8> %indices, i32 0
%index1 = extractelement <16 x i8> %indices, i32 1
diff --git a/test/CodeGen/X86/var-permute-256.ll b/test/CodeGen/X86/var-permute-256.ll
index dff145314ea..beef4643c13 100644
--- a/test/CodeGen/X86/var-permute-256.ll
+++ b/test/CodeGen/X86/var-permute-256.ll
@@ -34,32 +34,69 @@ define <4 x i64> @var_shuffle_v4i64(<4 x i64> %v, <4 x i64> %indices) nounwind {
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
-; INT256-LABEL: var_shuffle_v4i64:
-; INT256: # BB#0:
-; INT256-NEXT: pushq %rbp
-; INT256-NEXT: movq %rsp, %rbp
-; INT256-NEXT: andq $-32, %rsp
-; INT256-NEXT: subq $64, %rsp
-; INT256-NEXT: vmovq %xmm1, %rax
-; INT256-NEXT: andl $3, %eax
-; INT256-NEXT: vpextrq $1, %xmm1, %rcx
-; INT256-NEXT: andl $3, %ecx
-; INT256-NEXT: vextracti128 $1, %ymm1, %xmm1
-; INT256-NEXT: vmovq %xmm1, %rdx
-; INT256-NEXT: andl $3, %edx
-; INT256-NEXT: vpextrq $1, %xmm1, %rsi
-; INT256-NEXT: andl $3, %esi
-; INT256-NEXT: vmovaps %ymm0, (%rsp)
-; INT256-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; INT256-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; INT256-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; INT256-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; INT256-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; INT256-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; INT256-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; INT256-NEXT: movq %rbp, %rsp
-; INT256-NEXT: popq %rbp
-; INT256-NEXT: retq
+; AVX2-LABEL: var_shuffle_v4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: vmovq %xmm1, %rax
+; AVX2-NEXT: andl $3, %eax
+; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX2-NEXT: andl $3, %ecx
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT: vmovq %xmm1, %rdx
+; AVX2-NEXT: andl $3, %edx
+; AVX2-NEXT: vpextrq $1, %xmm1, %rsi
+; AVX2-NEXT: andl $3, %esi
+; AVX2-NEXT: vmovaps %ymm0, (%rsp)
+; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: var_shuffle_v4i64:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $64, %rsp
+; AVX512F-NEXT: vmovq %xmm1, %rax
+; AVX512F-NEXT: andl $3, %eax
+; AVX512F-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX512F-NEXT: andl $3, %ecx
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512F-NEXT: vmovq %xmm1, %rdx
+; AVX512F-NEXT: andl $3, %edx
+; AVX512F-NEXT: vpextrq $1, %xmm1, %rsi
+; AVX512F-NEXT: andl $3, %esi
+; AVX512F-NEXT: vmovaps %ymm0, (%rsp)
+; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: var_shuffle_v4i64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
+;
+; AVX512VLBW-LABEL: var_shuffle_v4i64:
+; AVX512VLBW: # BB#0:
+; AVX512VLBW-NEXT: vpermpd %ymm0, %ymm1, %ymm0
+; AVX512VLBW-NEXT: retq
%index0 = extractelement <4 x i64> %indices, i32 0
%index1 = extractelement <4 x i64> %indices, i32 1
%index2 = extractelement <4 x i64> %indices, i32 2
@@ -120,44 +157,7 @@ define <8 x i32> @var_shuffle_v8i32(<8 x i32> %v, <8 x i32> %indices) nounwind {
;
; INT256-LABEL: var_shuffle_v8i32:
; INT256: # BB#0:
-; INT256-NEXT: pushq %rbp
-; INT256-NEXT: movq %rsp, %rbp
-; INT256-NEXT: andq $-32, %rsp
-; INT256-NEXT: subq $64, %rsp
-; INT256-NEXT: vpextrq $1, %xmm1, %r8
-; INT256-NEXT: movq %r8, %rcx
-; INT256-NEXT: shrq $30, %rcx
-; INT256-NEXT: vmovq %xmm1, %r9
-; INT256-NEXT: movq %r9, %rsi
-; INT256-NEXT: shrq $30, %rsi
-; INT256-NEXT: vextracti128 $1, %ymm1, %xmm1
-; INT256-NEXT: vpextrq $1, %xmm1, %r10
-; INT256-NEXT: movq %r10, %rdi
-; INT256-NEXT: shrq $30, %rdi
-; INT256-NEXT: vmovq %xmm1, %rax
-; INT256-NEXT: movq %rax, %rdx
-; INT256-NEXT: shrq $30, %rdx
-; INT256-NEXT: vmovaps %ymm0, (%rsp)
-; INT256-NEXT: andl $7, %r9d
-; INT256-NEXT: andl $28, %esi
-; INT256-NEXT: andl $7, %r8d
-; INT256-NEXT: andl $28, %ecx
-; INT256-NEXT: andl $7, %eax
-; INT256-NEXT: andl $28, %edx
-; INT256-NEXT: andl $7, %r10d
-; INT256-NEXT: andl $28, %edi
-; INT256-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; INT256-NEXT: movq %rsp, %rax
-; INT256-NEXT: vpinsrd $1, (%rdx,%rax), %xmm0, %xmm0
-; INT256-NEXT: vpinsrd $2, (%rsp,%r10,4), %xmm0, %xmm0
-; INT256-NEXT: vpinsrd $3, (%rdi,%rax), %xmm0, %xmm0
-; INT256-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; INT256-NEXT: vpinsrd $1, (%rsi,%rax), %xmm1, %xmm1
-; INT256-NEXT: vpinsrd $2, (%rsp,%r8,4), %xmm1, %xmm1
-; INT256-NEXT: vpinsrd $3, (%rcx,%rax), %xmm1, %xmm1
-; INT256-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; INT256-NEXT: movq %rbp, %rsp
-; INT256-NEXT: popq %rbp
+; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
; INT256-NEXT: retq
%index0 = extractelement <8 x i32> %indices, i32 0
%index1 = extractelement <8 x i32> %indices, i32 1
@@ -250,68 +250,199 @@ define <16 x i16> @var_shuffle_v16i16(<16 x i16> %v, <16 x i16> %indices) nounwi
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
-; INT256-LABEL: var_shuffle_v16i16:
-; INT256: # BB#0:
-; INT256-NEXT: pushq %rbp
-; INT256-NEXT: movq %rsp, %rbp
-; INT256-NEXT: andq $-32, %rsp
-; INT256-NEXT: subq $64, %rsp
-; INT256-NEXT: vextracti128 $1, %ymm1, %xmm2
-; INT256-NEXT: vmovd %xmm2, %eax
-; INT256-NEXT: vmovaps %ymm0, (%rsp)
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: movzwl (%rsp,%rax,2), %eax
-; INT256-NEXT: vmovd %eax, %xmm0
-; INT256-NEXT: vpextrw $1, %xmm2, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0
-; INT256-NEXT: vpextrw $2, %xmm2, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0
-; INT256-NEXT: vpextrw $3, %xmm2, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0
-; INT256-NEXT: vpextrw $4, %xmm2, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0
-; INT256-NEXT: vpextrw $5, %xmm2, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0
-; INT256-NEXT: vpextrw $6, %xmm2, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
-; INT256-NEXT: vpextrw $7, %xmm2, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
-; INT256-NEXT: vmovd %xmm1, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: movzwl (%rsp,%rax,2), %eax
-; INT256-NEXT: vmovd %eax, %xmm2
-; INT256-NEXT: vpextrw $1, %xmm1, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm2, %xmm2
-; INT256-NEXT: vpextrw $2, %xmm1, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm2, %xmm2
-; INT256-NEXT: vpextrw $3, %xmm1, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm2, %xmm2
-; INT256-NEXT: vpextrw $4, %xmm1, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm2, %xmm2
-; INT256-NEXT: vpextrw $5, %xmm1, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm2, %xmm2
-; INT256-NEXT: vpextrw $6, %xmm1, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm2, %xmm2
-; INT256-NEXT: vpextrw $7, %xmm1, %eax
-; INT256-NEXT: andl $15, %eax
-; INT256-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm2, %xmm1
-; INT256-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; INT256-NEXT: movq %rbp, %rsp
-; INT256-NEXT: popq %rbp
-; INT256-NEXT: retq
+; AVX2-LABEL: var_shuffle_v16i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vmovd %xmm2, %eax
+; AVX2-NEXT: vmovaps %ymm0, (%rsp)
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vpextrw $1, %xmm2, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vpextrw $2, %xmm2, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vpextrw $3, %xmm2, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vpextrw $4, %xmm2, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vpextrw $5, %xmm2, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vpextrw $6, %xmm2, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vpextrw $7, %xmm2, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: vpextrw $1, %xmm1, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX2-NEXT: vpextrw $2, %xmm1, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX2-NEXT: vpextrw $3, %xmm1, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX2-NEXT: vpextrw $4, %xmm1, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX2-NEXT: vpextrw $5, %xmm1, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX2-NEXT: vpextrw $6, %xmm1, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX2-NEXT: vpextrw $7, %xmm1, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm2, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: var_shuffle_v16i16:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $64, %rsp
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vmovd %xmm2, %eax
+; AVX512F-NEXT: vmovaps %ymm0, (%rsp)
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vpextrw $1, %xmm2, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512F-NEXT: vpextrw $2, %xmm2, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512F-NEXT: vpextrw $3, %xmm2, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512F-NEXT: vpextrw $4, %xmm2, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512F-NEXT: vpextrw $5, %xmm2, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512F-NEXT: vpextrw $6, %xmm2, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512F-NEXT: vpextrw $7, %xmm2, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm2
+; AVX512F-NEXT: vpextrw $1, %xmm1, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrw $2, %xmm1, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrw $3, %xmm1, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrw $4, %xmm1, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrw $5, %xmm1, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrw $6, %xmm1, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrw $7, %xmm1, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm2, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: var_shuffle_v16i16:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: pushq %rbp
+; AVX512VL-NEXT: movq %rsp, %rbp
+; AVX512VL-NEXT: andq $-32, %rsp
+; AVX512VL-NEXT: subq $64, %rsp
+; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512VL-NEXT: vmovd %xmm2, %eax
+; AVX512VL-NEXT: vmovaps %ymm0, (%rsp)
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX512VL-NEXT: vmovd %eax, %xmm0
+; AVX512VL-NEXT: vpextrw $1, %xmm2, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrw $2, %xmm2, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrw $3, %xmm2, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrw $4, %xmm2, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrw $5, %xmm2, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrw $6, %xmm2, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrw $7, %xmm2, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512VL-NEXT: vmovd %xmm1, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX512VL-NEXT: vmovd %eax, %xmm2
+; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm2, %xmm1
+; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512VL-NEXT: movq %rbp, %rsp
+; AVX512VL-NEXT: popq %rbp
+; AVX512VL-NEXT: retq
+;
+; AVX512VLBW-LABEL: var_shuffle_v16i16:
+; AVX512VLBW: # BB#0:
+; AVX512VLBW-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VLBW-NEXT: retq
%index0 = extractelement <16 x i16> %indices, i32 0
%index1 = extractelement <16 x i16> %indices, i32 1
%index2 = extractelement <16 x i16> %indices, i32 2
@@ -492,133 +623,394 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind {
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
-; INT256-LABEL: var_shuffle_v32i8:
-; INT256: # BB#0:
-; INT256-NEXT: pushq %rbp
-; INT256-NEXT: movq %rsp, %rbp
-; INT256-NEXT: andq $-32, %rsp
-; INT256-NEXT: subq $64, %rsp
-; INT256-NEXT: vextracti128 $1, %ymm1, %xmm2
-; INT256-NEXT: vpextrb $0, %xmm2, %eax
-; INT256-NEXT: vmovaps %ymm0, (%rsp)
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movq %rsp, %rcx
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vmovd %eax, %xmm0
-; INT256-NEXT: vpextrb $1, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $2, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $3, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $4, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $5, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $6, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $7, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $8, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $9, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $10, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $11, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $12, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $13, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $14, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $15, %xmm2, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; INT256-NEXT: vpextrb $0, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vmovd %eax, %xmm2
-; INT256-NEXT: vpextrb $1, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $2, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $3, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $4, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $5, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $5, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $6, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $6, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $7, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $8, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $9, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $10, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $11, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $12, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $13, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $14, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2
-; INT256-NEXT: vpextrb $15, %xmm1, %eax
-; INT256-NEXT: andl $31, %eax
-; INT256-NEXT: movzbl (%rax,%rcx), %eax
-; INT256-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
-; INT256-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; INT256-NEXT: movq %rbp, %rsp
-; INT256-NEXT: popq %rbp
-; INT256-NEXT: retq
+; AVX2-LABEL: var_shuffle_v32i8:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $0, %xmm2, %eax
+; AVX2-NEXT: vmovaps %ymm0, (%rsp)
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movq %rsp, %rcx
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vpextrb $1, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $2, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $3, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $4, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $5, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $6, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $7, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $8, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $9, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $10, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $11, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $12, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $13, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $14, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $15, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $0, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: vpextrb $1, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $2, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $3, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $4, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $5, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $5, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $6, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $6, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $7, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $8, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $9, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $10, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $11, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $12, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $13, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $14, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $15, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: var_shuffle_v32i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $64, %rsp
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $0, %xmm2, %eax
+; AVX512F-NEXT: vmovaps %ymm0, (%rsp)
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movq %rsp, %rcx
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vpextrb $1, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $2, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $3, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $4, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $5, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $6, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $7, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $8, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $9, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $10, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $11, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $12, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $13, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $14, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $15, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $0, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm2
+; AVX512F-NEXT: vpextrb $1, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $2, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $3, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $4, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $5, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $5, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $6, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $6, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $7, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $8, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $9, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $10, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $11, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $12, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $13, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $14, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $15, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: var_shuffle_v32i8:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: pushq %rbp
+; AVX512VL-NEXT: movq %rsp, %rbp
+; AVX512VL-NEXT: andq $-32, %rsp
+; AVX512VL-NEXT: subq $64, %rsp
+; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512VL-NEXT: vpextrb $0, %xmm2, %eax
+; AVX512VL-NEXT: vmovaps %ymm0, (%rsp)
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movq %rsp, %rcx
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vmovd %eax, %xmm0
+; AVX512VL-NEXT: vpextrb $1, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $2, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $3, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $4, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $5, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $6, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $7, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $8, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $9, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $10, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $11, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $12, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $13, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $14, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $15, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $0, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vmovd %eax, %xmm2
+; AVX512VL-NEXT: vpextrb $1, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $2, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $3, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $4, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $5, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $5, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $6, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $6, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $7, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $8, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $9, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $10, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $11, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $12, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $13, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $14, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $15, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
+; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512VL-NEXT: movq %rbp, %rsp
+; AVX512VL-NEXT: popq %rbp
+; AVX512VL-NEXT: retq
+;
+; VBMI-LABEL: var_shuffle_v32i8:
+; VBMI: # BB#0:
+; VBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; VBMI-NEXT: retq
%index0 = extractelement <32 x i8> %indices, i32 0
%index1 = extractelement <32 x i8> %indices, i32 1
%index2 = extractelement <32 x i8> %indices, i32 2
@@ -744,30 +1136,65 @@ define <4 x double> @var_shuffle_v4f64(<4 x double> %v, <4 x i64> %indices) noun
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
-; INT256-LABEL: var_shuffle_v4f64:
-; INT256: # BB#0:
-; INT256-NEXT: pushq %rbp
-; INT256-NEXT: movq %rsp, %rbp
-; INT256-NEXT: andq $-32, %rsp
-; INT256-NEXT: subq $64, %rsp
-; INT256-NEXT: vmovq %xmm1, %rax
-; INT256-NEXT: andl $3, %eax
-; INT256-NEXT: vpextrq $1, %xmm1, %rcx
-; INT256-NEXT: andl $3, %ecx
-; INT256-NEXT: vextracti128 $1, %ymm1, %xmm1
-; INT256-NEXT: vmovq %xmm1, %rdx
-; INT256-NEXT: andl $3, %edx
-; INT256-NEXT: vpextrq $1, %xmm1, %rsi
-; INT256-NEXT: andl $3, %esi
-; INT256-NEXT: vmovaps %ymm0, (%rsp)
-; INT256-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; INT256-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; INT256-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; INT256-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; INT256-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; INT256-NEXT: movq %rbp, %rsp
-; INT256-NEXT: popq %rbp
-; INT256-NEXT: retq
+; AVX2-LABEL: var_shuffle_v4f64:
+; AVX2: # BB#0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: vmovq %xmm1, %rax
+; AVX2-NEXT: andl $3, %eax
+; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX2-NEXT: andl $3, %ecx
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT: vmovq %xmm1, %rdx
+; AVX2-NEXT: andl $3, %edx
+; AVX2-NEXT: vpextrq $1, %xmm1, %rsi
+; AVX2-NEXT: andl $3, %esi
+; AVX2-NEXT: vmovaps %ymm0, (%rsp)
+; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: var_shuffle_v4f64:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $64, %rsp
+; AVX512F-NEXT: vmovq %xmm1, %rax
+; AVX512F-NEXT: andl $3, %eax
+; AVX512F-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX512F-NEXT: andl $3, %ecx
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512F-NEXT: vmovq %xmm1, %rdx
+; AVX512F-NEXT: andl $3, %edx
+; AVX512F-NEXT: vpextrq $1, %xmm1, %rsi
+; AVX512F-NEXT: andl $3, %esi
+; AVX512F-NEXT: vmovaps %ymm0, (%rsp)
+; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: var_shuffle_v4f64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
+;
+; AVX512VLBW-LABEL: var_shuffle_v4f64:
+; AVX512VLBW: # BB#0:
+; AVX512VLBW-NEXT: vpermpd %ymm0, %ymm1, %ymm0
+; AVX512VLBW-NEXT: retq
%index0 = extractelement <4 x i64> %indices, i32 0
%index1 = extractelement <4 x i64> %indices, i32 1
%index2 = extractelement <4 x i64> %indices, i32 2
@@ -828,44 +1255,7 @@ define <8 x float> @var_shuffle_v8f32(<8 x float> %v, <8 x i32> %indices) nounwi
;
; INT256-LABEL: var_shuffle_v8f32:
; INT256: # BB#0:
-; INT256-NEXT: pushq %rbp
-; INT256-NEXT: movq %rsp, %rbp
-; INT256-NEXT: andq $-32, %rsp
-; INT256-NEXT: subq $64, %rsp
-; INT256-NEXT: vpextrq $1, %xmm1, %r8
-; INT256-NEXT: movq %r8, %rcx
-; INT256-NEXT: shrq $30, %rcx
-; INT256-NEXT: vmovq %xmm1, %r9
-; INT256-NEXT: movq %r9, %rdx
-; INT256-NEXT: shrq $30, %rdx
-; INT256-NEXT: vextracti128 $1, %ymm1, %xmm1
-; INT256-NEXT: vpextrq $1, %xmm1, %r10
-; INT256-NEXT: movq %r10, %rdi
-; INT256-NEXT: shrq $30, %rdi
-; INT256-NEXT: vmovq %xmm1, %rax
-; INT256-NEXT: movq %rax, %rsi
-; INT256-NEXT: shrq $30, %rsi
-; INT256-NEXT: vmovaps %ymm0, (%rsp)
-; INT256-NEXT: andl $7, %r9d
-; INT256-NEXT: andl $28, %edx
-; INT256-NEXT: andl $7, %r8d
-; INT256-NEXT: andl $28, %ecx
-; INT256-NEXT: andl $7, %eax
-; INT256-NEXT: andl $28, %esi
-; INT256-NEXT: andl $7, %r10d
-; INT256-NEXT: andl $28, %edi
-; INT256-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; INT256-NEXT: movq %rsp, %rax
-; INT256-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; INT256-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; INT256-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; INT256-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; INT256-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
-; INT256-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
-; INT256-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
-; INT256-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; INT256-NEXT: movq %rbp, %rsp
-; INT256-NEXT: popq %rbp
+; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
; INT256-NEXT: retq
%index0 = extractelement <8 x i32> %indices, i32 0
%index1 = extractelement <8 x i32> %indices, i32 1
diff --git a/test/CodeGen/X86/var-permute-512.ll b/test/CodeGen/X86/var-permute-512.ll
index bd1f220ceb1..15c7a1c8b8b 100644
--- a/test/CodeGen/X86/var-permute-512.ll
+++ b/test/CodeGen/X86/var-permute-512.ll
@@ -6,47 +6,7 @@
define <8 x i64> @var_shuffle_v8i64(<8 x i64> %v, <8 x i64> %indices) nounwind {
; AVX512-LABEL: var_shuffle_v8i64:
; AVX512: # BB#0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: movq %rsp, %rbp
-; AVX512-NEXT: andq $-64, %rsp
-; AVX512-NEXT: subq $128, %rsp
-; AVX512-NEXT: vmovq %xmm1, %r8
-; AVX512-NEXT: andl $7, %r8d
-; AVX512-NEXT: vpextrq $1, %xmm1, %r9
-; AVX512-NEXT: andl $7, %r9d
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT: vmovq %xmm2, %r10
-; AVX512-NEXT: andl $7, %r10d
-; AVX512-NEXT: vpextrq $1, %xmm2, %rsi
-; AVX512-NEXT: andl $7, %esi
-; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm2
-; AVX512-NEXT: vmovq %xmm2, %rdi
-; AVX512-NEXT: andl $7, %edi
-; AVX512-NEXT: vpextrq $1, %xmm2, %rax
-; AVX512-NEXT: andl $7, %eax
-; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm1
-; AVX512-NEXT: vmovq %xmm1, %rcx
-; AVX512-NEXT: andl $7, %ecx
-; AVX512-NEXT: vpextrq $1, %xmm1, %rdx
-; AVX512-NEXT: andl $7, %edx
-; AVX512-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-NEXT: movq %rbp, %rsp
-; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: retq
%index0 = extractelement <8 x i64> %indices, i32 0
%index1 = extractelement <8 x i64> %indices, i32 1
@@ -78,76 +38,7 @@ define <8 x i64> @var_shuffle_v8i64(<8 x i64> %v, <8 x i64> %indices) nounwind {
define <16 x i32> @var_shuffle_v16i32(<16 x i32> %v, <16 x i32> %indices) nounwind {
; AVX512-LABEL: var_shuffle_v16i32:
; AVX512: # BB#0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: movq %rsp, %rbp
-; AVX512-NEXT: andq $-64, %rsp
-; AVX512-NEXT: subq $128, %rsp
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm3
-; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm4
-; AVX512-NEXT: vpextrq $1, %xmm4, %rax
-; AVX512-NEXT: vmovq %xmm4, %rdx
-; AVX512-NEXT: movq %rdx, %rcx
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512-NEXT: andl $15, %edx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512-NEXT: movq %rsp, %rdx
-; AVX512-NEXT: vpinsrd $1, (%rcx,%rdx), %xmm0, %xmm0
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vpinsrd $2, (%rsp,%rax,4), %xmm0, %xmm0
-; AVX512-NEXT: vmovq %xmm3, %rax
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vpinsrd $3, (%rcx,%rdx), %xmm0, %xmm0
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; AVX512-NEXT: vpextrq $1, %xmm3, %rax
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vpinsrd $1, (%rcx,%rdx), %xmm4, %xmm3
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vpinsrd $2, (%rsp,%rax,4), %xmm3, %xmm3
-; AVX512-NEXT: vmovq %xmm2, %rax
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vpinsrd $3, (%rcx,%rdx), %xmm3, %xmm3
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; AVX512-NEXT: vpextrq $1, %xmm2, %rax
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vpinsrd $1, (%rcx,%rdx), %xmm4, %xmm2
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vpinsrd $2, (%rsp,%rax,4), %xmm2, %xmm2
-; AVX512-NEXT: vmovq %xmm1, %rax
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vpinsrd $3, (%rcx,%rdx), %xmm2, %xmm2
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vmovd {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; AVX512-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vpinsrd $1, (%rcx,%rdx), %xmm4, %xmm1
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vpinsrd $2, (%rsp,%rax,4), %xmm1, %xmm1
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vpinsrd $3, (%rcx,%rdx), %xmm1, %xmm1
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-NEXT: movq %rbp, %rsp
-; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: vpermps %zmm0, %zmm1, %zmm0
; AVX512-NEXT: retq
%index0 = extractelement <16 x i32> %indices, i32 0
%index1 = extractelement <16 x i32> %indices, i32 1
@@ -381,136 +272,7 @@ define <32 x i16> @var_shuffle_v32i16(<32 x i16> %v, <32 x i16> %indices) nounwi
;
; AVX512BW-LABEL: var_shuffle_v32i16:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: pushq %rbp
-; AVX512BW-NEXT: movq %rsp, %rbp
-; AVX512BW-NEXT: andq $-64, %rsp
-; AVX512BW-NEXT: subq $128, %rsp
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm3
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm4
-; AVX512BW-NEXT: vmovd %xmm4, %eax
-; AVX512BW-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm0
-; AVX512BW-NEXT: vpextrw $1, %xmm4, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrw $2, %xmm4, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrw $3, %xmm4, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrw $4, %xmm4, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrw $5, %xmm4, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrw $6, %xmm4, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrw $7, %xmm4, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovd %xmm3, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm4
-; AVX512BW-NEXT: vpextrw $1, %xmm3, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $2, %xmm3, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $3, %xmm3, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $4, %xmm3, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $4, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $5, %xmm3, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $6, %xmm3, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $7, %xmm3, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $7, %eax, %xmm4, %xmm3
-; AVX512BW-NEXT: vmovd %xmm2, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm4
-; AVX512BW-NEXT: vpextrw $1, %xmm2, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $2, %xmm2, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $3, %xmm2, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $4, %xmm2, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $5, %xmm2, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $5, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $6, %xmm2, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $6, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $7, %xmm2, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vpinsrw $7, %eax, %xmm4, %xmm2
-; AVX512BW-NEXT: vmovd %xmm1, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm4
-; AVX512BW-NEXT: vpextrw $1, %xmm1, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $2, %xmm1, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $3, %xmm1, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $4, %xmm1, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $5, %xmm1, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $6, %xmm1, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrw $7, %xmm1, %eax
-; AVX512BW-NEXT: andl $31, %eax
-; AVX512BW-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm4, %xmm1
-; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT: movq %rbp, %rsp
-; AVX512BW-NEXT: popq %rbp
+; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
%index0 = extractelement <32 x i16> %indices, i32 0
%index1 = extractelement <32 x i16> %indices, i32 1
@@ -1014,267 +776,10 @@ define <64 x i8> @var_shuffle_v64i8(<64 x i8> %v, <64 x i8> %indices) nounwind {
; NOBW-NEXT: popq %rbp
; NOBW-NEXT: retq
;
-; AVX512BW-LABEL: var_shuffle_v64i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: pushq %rbp
-; AVX512BW-NEXT: movq %rsp, %rbp
-; AVX512BW-NEXT: andq $-64, %rsp
-; AVX512BW-NEXT: subq $128, %rsp
-; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrb $1, %xmm1, %eax
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm3
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm4
-; AVX512BW-NEXT: vpextrb $0, %xmm4, %edx
-; AVX512BW-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movq %rsp, %rsi
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vmovd %edx, %xmm0
-; AVX512BW-NEXT: vpextrb $1, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $2, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $3, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $4, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $4, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $5, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $5, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $6, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $6, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $7, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $7, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $8, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $8, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $9, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $9, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $10, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $10, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $11, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $11, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $12, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $13, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $13, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $14, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $14, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $15, %xmm4, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $15, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $0, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vmovd %edx, %xmm4
-; AVX512BW-NEXT: vpextrb $1, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $1, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $2, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $2, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $3, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $3, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $4, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $4, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $5, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $5, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $6, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $6, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $7, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $7, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $8, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $8, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $9, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $9, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $10, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $10, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $11, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $11, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $12, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $12, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $13, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $13, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $14, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $14, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $15, %xmm3, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $15, %edx, %xmm4, %xmm3
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vmovd %edx, %xmm4
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $1, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $2, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $2, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $3, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $3, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $4, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $5, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $6, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $6, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $7, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $7, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $8, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $9, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $10, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $10, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $11, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $11, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $12, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $13, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $14, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $14, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $15, %xmm2, %edx
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: vpinsrb $15, %edx, %xmm4, %xmm2
-; AVX512BW-NEXT: vpextrb $2, %xmm1, %edx
-; AVX512BW-NEXT: andl $63, %ecx
-; AVX512BW-NEXT: movzbl (%rcx,%rsi), %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm4
-; AVX512BW-NEXT: vpextrb $3, %xmm1, %ecx
-; AVX512BW-NEXT: andl $63, %eax
-; AVX512BW-NEXT: vpinsrb $1, (%rax,%rsi), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $4, %xmm1, %eax
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: vpinsrb $2, (%rdx,%rsi), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $5, %xmm1, %edx
-; AVX512BW-NEXT: andl $63, %ecx
-; AVX512BW-NEXT: vpinsrb $3, (%rcx,%rsi), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $6, %xmm1, %ecx
-; AVX512BW-NEXT: andl $63, %eax
-; AVX512BW-NEXT: vpinsrb $4, (%rax,%rsi), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $7, %xmm1, %eax
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: vpinsrb $5, (%rdx,%rsi), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $8, %xmm1, %edx
-; AVX512BW-NEXT: andl $63, %ecx
-; AVX512BW-NEXT: vpinsrb $6, (%rcx,%rsi), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $9, %xmm1, %ecx
-; AVX512BW-NEXT: andl $63, %eax
-; AVX512BW-NEXT: vpinsrb $7, (%rax,%rsi), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $10, %xmm1, %eax
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: vpinsrb $8, (%rdx,%rsi), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $11, %xmm1, %edx
-; AVX512BW-NEXT: andl $63, %ecx
-; AVX512BW-NEXT: vpinsrb $9, (%rcx,%rsi), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $12, %xmm1, %ecx
-; AVX512BW-NEXT: andl $63, %eax
-; AVX512BW-NEXT: vpinsrb $10, (%rax,%rsi), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $13, %xmm1, %eax
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: vpinsrb $11, (%rdx,%rsi), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $14, %xmm1, %edx
-; AVX512BW-NEXT: andl $63, %ecx
-; AVX512BW-NEXT: vpinsrb $12, (%rcx,%rsi), %xmm4, %xmm4
-; AVX512BW-NEXT: vpextrb $15, %xmm1, %ecx
-; AVX512BW-NEXT: andl $63, %eax
-; AVX512BW-NEXT: andl $63, %edx
-; AVX512BW-NEXT: andl $63, %ecx
-; AVX512BW-NEXT: movzbl (%rcx,%rsi), %ecx
-; AVX512BW-NEXT: movzbl (%rdx,%rsi), %edx
-; AVX512BW-NEXT: movzbl (%rax,%rsi), %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm4, %xmm1
-; AVX512BW-NEXT: vpinsrb $14, %edx, %xmm1, %xmm1
-; AVX512BW-NEXT: vpinsrb $15, %ecx, %xmm1, %xmm1
-; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT: movq %rbp, %rsp
-; AVX512BW-NEXT: popq %rbp
-; AVX512BW-NEXT: retq
+; VBMI-LABEL: var_shuffle_v64i8:
+; VBMI: # BB#0:
+; VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
+; VBMI-NEXT: retq
%index0 = extractelement <64 x i8> %indices, i32 0
%index1 = extractelement <64 x i8> %indices, i32 1
%index2 = extractelement <64 x i8> %indices, i32 2
@@ -1473,43 +978,7 @@ define <64 x i8> @var_shuffle_v64i8(<64 x i8> %v, <64 x i8> %indices) nounwind {
define <8 x double> @var_shuffle_v8f64(<8 x double> %v, <8 x i64> %indices) nounwind {
; AVX512-LABEL: var_shuffle_v8f64:
; AVX512: # BB#0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: movq %rsp, %rbp
-; AVX512-NEXT: andq $-64, %rsp
-; AVX512-NEXT: subq $128, %rsp
-; AVX512-NEXT: vmovq %xmm1, %r8
-; AVX512-NEXT: andl $7, %r8d
-; AVX512-NEXT: vpextrq $1, %xmm1, %r9
-; AVX512-NEXT: andl $7, %r9d
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT: vmovq %xmm2, %r10
-; AVX512-NEXT: andl $7, %r10d
-; AVX512-NEXT: vpextrq $1, %xmm2, %rsi
-; AVX512-NEXT: andl $7, %esi
-; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm2
-; AVX512-NEXT: vmovq %xmm2, %rdi
-; AVX512-NEXT: andl $7, %edi
-; AVX512-NEXT: vpextrq $1, %xmm2, %rax
-; AVX512-NEXT: andl $7, %eax
-; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm1
-; AVX512-NEXT: vmovq %xmm1, %rcx
-; AVX512-NEXT: andl $7, %ecx
-; AVX512-NEXT: vpextrq $1, %xmm1, %rdx
-; AVX512-NEXT: andl $7, %edx
-; AVX512-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512-NEXT: vmovhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
-; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-NEXT: movq %rbp, %rsp
-; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: retq
%index0 = extractelement <8 x i64> %indices, i32 0
%index1 = extractelement <8 x i64> %indices, i32 1
@@ -1541,76 +1010,7 @@ define <8 x double> @var_shuffle_v8f64(<8 x double> %v, <8 x i64> %indices) noun
define <16 x float> @var_shuffle_v16f32(<16 x float> %v, <16 x i32> %indices) nounwind {
; AVX512-LABEL: var_shuffle_v16f32:
; AVX512: # BB#0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: movq %rsp, %rbp
-; AVX512-NEXT: andq $-64, %rsp
-; AVX512-NEXT: subq $128, %rsp
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm3
-; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm4
-; AVX512-NEXT: vpextrq $1, %xmm4, %rax
-; AVX512-NEXT: vmovq %xmm4, %rdx
-; AVX512-NEXT: movq %rdx, %rcx
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512-NEXT: andl $15, %edx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512-NEXT: movq %rsp, %rdx
-; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; AVX512-NEXT: vmovq %xmm3, %rax
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; AVX512-NEXT: vpextrq $1, %xmm3, %rax
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],mem[0],xmm4[2,3]
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3]
-; AVX512-NEXT: vmovq %xmm2, %rax
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0]
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; AVX512-NEXT: vpextrq $1, %xmm2, %rax
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm4[0],mem[0],xmm4[2,3]
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
-; AVX512-NEXT: vmovq %xmm1, %rax
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; AVX512-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0],mem[0],xmm4[2,3]
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andl $15, %eax
-; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
-; AVX512-NEXT: shrq $30, %rcx
-; AVX512-NEXT: andl $60, %ecx
-; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
-; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-NEXT: movq %rbp, %rsp
-; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: vpermps %zmm0, %zmm1, %zmm0
; AVX512-NEXT: retq
%index0 = extractelement <16 x i32> %indices, i32 0
%index1 = extractelement <16 x i32> %indices, i32 1
diff --git a/test/CodeGen/X86/vec_fp_to_int.ll b/test/CodeGen/X86/vec_fp_to_int.ll
index c6335d751ed..2f52bab2803 100644
--- a/test/CodeGen/X86/vec_fp_to_int.ll
+++ b/test/CodeGen/X86/vec_fp_to_int.ll
@@ -2288,67 +2288,19 @@ define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind {
; VEX-NEXT: popq %rax
; VEX-NEXT: retq
;
-; AVX512F-LABEL: fptosi_2f16_to_4i32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512F-NEXT: vcvttss2si %xmm1, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm1
-; AVX512F-NEXT: vcvttss2si %xmm0, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: fptosi_2f16_to_4i32:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT: vcvttss2si %xmm1, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm1
-; AVX512VL-NEXT: vcvttss2si %xmm0, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: fptosi_2f16_to_4i32:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512DQ-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512DQ-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512DQ-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512DQ-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512DQ-NEXT: vcvttss2si %xmm1, %rax
-; AVX512DQ-NEXT: vmovq %rax, %xmm1
-; AVX512DQ-NEXT: vcvttss2si %xmm0, %rax
-; AVX512DQ-NEXT: vmovq %rax, %xmm0
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: fptosi_2f16_to_4i32:
-; AVX512VLDQ: # BB#0:
-; AVX512VLDQ-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VLDQ-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512VLDQ-NEXT: vcvttss2si %xmm1, %rax
-; AVX512VLDQ-NEXT: vmovq %rax, %xmm1
-; AVX512VLDQ-NEXT: vcvttss2si %xmm0, %rax
-; AVX512VLDQ-NEXT: vmovq %rax, %xmm0
-; AVX512VLDQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VLDQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX512VLDQ-NEXT: retq
+; AVX512-LABEL: fptosi_2f16_to_4i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: vcvttss2si %xmm1, %rax
+; AVX512-NEXT: vmovq %rax, %xmm1
+; AVX512-NEXT: vcvttss2si %xmm0, %rax
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; AVX512-NEXT: retq
%cvt = fptosi <2 x half> %a to <2 x i32>
%ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x i32> %ext
diff --git a/test/CodeGen/X86/vector-half-conversions.ll b/test/CodeGen/X86/vector-half-conversions.ll
index 6e664ba98d9..9feff88a576 100644
--- a/test/CodeGen/X86/vector-half-conversions.ll
+++ b/test/CodeGen/X86/vector-half-conversions.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,-f16c -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512VL
;
@@ -9,35 +9,12 @@
;
define float @cvt_i16_to_f32(i16 %a0) nounwind {
-; AVX1-LABEL: cvt_i16_to_f32:
-; AVX1: # BB#0:
-; AVX1-NEXT: movswl %di, %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: cvt_i16_to_f32:
-; AVX2: # BB#0:
-; AVX2-NEXT: movswl %di, %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: cvt_i16_to_f32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: movswl %di, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_i16_to_f32:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: movswl %di, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: cvt_i16_to_f32:
+; ALL: # BB#0:
+; ALL-NEXT: movswl %di, %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: retq
%1 = bitcast i16 %a0 to half
%2 = fpext half %1 to float
ret float %2
@@ -111,19 +88,18 @@ define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) nounwind {
; AVX512F-NEXT: shrq $48, %rdx
; AVX512F-NEXT: movswl %dx, %edx
; AVX512F-NEXT: vmovd %edx, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: vmovd %esi, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
+; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_4i16_to_4f32:
@@ -222,19 +198,18 @@ define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) nounwind {
; AVX512F-NEXT: shrq $48, %rdx
; AVX512F-NEXT: movswl %dx, %edx
; AVX512F-NEXT: vmovd %edx, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: vmovd %esi, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
+; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_8i16_to_4f32:
@@ -271,201 +246,54 @@ define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) nounwind {
}
define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) nounwind {
-; AVX1-LABEL: cvt_8i16_to_8f32:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX1-NEXT: movq %rdx, %r8
-; AVX1-NEXT: movq %rdx, %r10
-; AVX1-NEXT: movswl %dx, %r9d
-; AVX1-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill>
-; AVX1-NEXT: shrl $16, %edx
-; AVX1-NEXT: shrq $32, %r8
-; AVX1-NEXT: shrq $48, %r10
-; AVX1-NEXT: vmovq %xmm0, %rdi
-; AVX1-NEXT: movq %rdi, %rax
-; AVX1-NEXT: movq %rdi, %rsi
-; AVX1-NEXT: movswl %di, %ecx
-; AVX1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<kill>
-; AVX1-NEXT: shrl $16, %edi
-; AVX1-NEXT: shrq $32, %rax
-; AVX1-NEXT: shrq $48, %rsi
-; AVX1-NEXT: movswl %si, %esi
-; AVX1-NEXT: vmovd %esi, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: cwtl
-; AVX1-NEXT: vmovd %eax, %xmm1
-; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX1-NEXT: movswl %di, %eax
-; AVX1-NEXT: vmovd %eax, %xmm2
-; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX1-NEXT: vmovd %ecx, %xmm3
-; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX1-NEXT: movswl %r10w, %eax
-; AVX1-NEXT: vmovd %eax, %xmm4
-; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX1-NEXT: movswl %r8w, %eax
-; AVX1-NEXT: vmovd %eax, %xmm5
-; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX1-NEXT: movswl %dx, %eax
-; AVX1-NEXT: vmovd %eax, %xmm6
-; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX1-NEXT: vmovd %r9d, %xmm7
-; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: cvt_8i16_to_8f32:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX2-NEXT: movq %rdx, %r8
-; AVX2-NEXT: movq %rdx, %r10
-; AVX2-NEXT: movswl %dx, %r9d
-; AVX2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill>
-; AVX2-NEXT: shrl $16, %edx
-; AVX2-NEXT: shrq $32, %r8
-; AVX2-NEXT: shrq $48, %r10
-; AVX2-NEXT: vmovq %xmm0, %rdi
-; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: movq %rdi, %rsi
-; AVX2-NEXT: movswl %di, %ecx
-; AVX2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<kill>
-; AVX2-NEXT: shrl $16, %edi
-; AVX2-NEXT: shrq $32, %rax
-; AVX2-NEXT: shrq $48, %rsi
-; AVX2-NEXT: movswl %si, %esi
-; AVX2-NEXT: vmovd %esi, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: cwtl
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX2-NEXT: movswl %di, %eax
-; AVX2-NEXT: vmovd %eax, %xmm2
-; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX2-NEXT: vmovd %ecx, %xmm3
-; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX2-NEXT: movswl %r10w, %eax
-; AVX2-NEXT: vmovd %eax, %xmm4
-; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX2-NEXT: movswl %r8w, %eax
-; AVX2-NEXT: vmovd %eax, %xmm5
-; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX2-NEXT: movswl %dx, %eax
-; AVX2-NEXT: vmovd %eax, %xmm6
-; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX2-NEXT: vmovd %r9d, %xmm7
-; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX2-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: cvt_8i16_to_8f32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX512F-NEXT: movq %rdx, %r8
-; AVX512F-NEXT: movq %rdx, %r9
-; AVX512F-NEXT: movswl %dx, %r10d
-; AVX512F-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill>
-; AVX512F-NEXT: shrl $16, %edx
-; AVX512F-NEXT: shrq $32, %r8
-; AVX512F-NEXT: shrq $48, %r9
-; AVX512F-NEXT: vmovq %xmm0, %rdi
-; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: movq %rdi, %rcx
-; AVX512F-NEXT: movswl %di, %esi
-; AVX512F-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<kill>
-; AVX512F-NEXT: shrl $16, %edi
-; AVX512F-NEXT: shrq $32, %rax
-; AVX512F-NEXT: shrq $48, %rcx
-; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: cwtl
-; AVX512F-NEXT: vmovd %eax, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512F-NEXT: movswl %di, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
-; AVX512F-NEXT: vmovd %esi, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
-; AVX512F-NEXT: movswl %r9w, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm4
-; AVX512F-NEXT: vcvtph2ps %ymm4, %zmm4
-; AVX512F-NEXT: movswl %r8w, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm5
-; AVX512F-NEXT: vcvtph2ps %ymm5, %zmm5
-; AVX512F-NEXT: movswl %dx, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm6
-; AVX512F-NEXT: vcvtph2ps %ymm6, %zmm6
-; AVX512F-NEXT: vmovd %r10d, %xmm7
-; AVX512F-NEXT: vcvtph2ps %ymm7, %zmm7
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_8i16_to_8f32:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX512VL-NEXT: movq %rdx, %r8
-; AVX512VL-NEXT: movq %rdx, %r10
-; AVX512VL-NEXT: movswl %dx, %r9d
-; AVX512VL-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill>
-; AVX512VL-NEXT: shrl $16, %edx
-; AVX512VL-NEXT: shrq $32, %r8
-; AVX512VL-NEXT: shrq $48, %r10
-; AVX512VL-NEXT: vmovq %xmm0, %rdi
-; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movq %rdi, %rsi
-; AVX512VL-NEXT: movswl %di, %ecx
-; AVX512VL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<kill>
-; AVX512VL-NEXT: shrl $16, %edi
-; AVX512VL-NEXT: shrq $32, %rax
-; AVX512VL-NEXT: shrq $48, %rsi
-; AVX512VL-NEXT: movswl %si, %esi
-; AVX512VL-NEXT: vmovd %esi, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: cwtl
-; AVX512VL-NEXT: vmovd %eax, %xmm1
-; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT: movswl %di, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm2
-; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512VL-NEXT: vmovd %ecx, %xmm3
-; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512VL-NEXT: movswl %r10w, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm4
-; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX512VL-NEXT: movswl %r8w, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm5
-; AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX512VL-NEXT: movswl %dx, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm6
-; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX512VL-NEXT: vmovd %r9d, %xmm7
-; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: cvt_8i16_to_8f32:
+; ALL: # BB#0:
+; ALL-NEXT: vpextrq $1, %xmm0, %rdx
+; ALL-NEXT: movq %rdx, %r8
+; ALL-NEXT: movq %rdx, %r10
+; ALL-NEXT: movswl %dx, %r9d
+; ALL-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill>
+; ALL-NEXT: shrl $16, %edx
+; ALL-NEXT: shrq $32, %r8
+; ALL-NEXT: shrq $48, %r10
+; ALL-NEXT: vmovq %xmm0, %rdi
+; ALL-NEXT: movq %rdi, %rax
+; ALL-NEXT: movq %rdi, %rsi
+; ALL-NEXT: movswl %di, %ecx
+; ALL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<kill>
+; ALL-NEXT: shrl $16, %edi
+; ALL-NEXT: shrq $32, %rax
+; ALL-NEXT: shrq $48, %rsi
+; ALL-NEXT: movswl %si, %esi
+; ALL-NEXT: vmovd %esi, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: cwtl
+; ALL-NEXT: vmovd %eax, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: movswl %di, %eax
+; ALL-NEXT: vmovd %eax, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: vmovd %ecx, %xmm3
+; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT: movswl %r10w, %eax
+; ALL-NEXT: vmovd %eax, %xmm4
+; ALL-NEXT: vcvtph2ps %xmm4, %xmm4
+; ALL-NEXT: movswl %r8w, %eax
+; ALL-NEXT: vmovd %eax, %xmm5
+; ALL-NEXT: vcvtph2ps %xmm5, %xmm5
+; ALL-NEXT: movswl %dx, %eax
+; ALL-NEXT: vmovd %eax, %xmm6
+; ALL-NEXT: vcvtph2ps %xmm6, %xmm6
+; ALL-NEXT: vmovd %r9d, %xmm7
+; ALL-NEXT: vcvtph2ps %xmm7, %xmm7
+; ALL-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
+; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; ALL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; ALL-NEXT: retq
%1 = bitcast <8 x i16> %a0 to <8 x half>
%2 = fpext <8 x half> %1 to <8 x float>
ret <8 x float> %2
@@ -664,98 +492,98 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind {
;
; AVX512F-LABEL: cvt_16i16_to_16f32:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm10
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $48, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm2
+; AVX512F-NEXT: vmovd %ecx, %xmm8
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $32, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm3
+; AVX512F-NEXT: vmovd %ecx, %xmm9
; AVX512F-NEXT: movswl %ax, %ecx
; AVX512F-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
; AVX512F-NEXT: shrl $16, %eax
; AVX512F-NEXT: cwtl
-; AVX512F-NEXT: vmovd %eax, %xmm4
+; AVX512F-NEXT: vmovd %eax, %xmm11
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT: vmovd %ecx, %xmm0
+; AVX512F-NEXT: vmovd %ecx, %xmm12
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $48, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm5
+; AVX512F-NEXT: vmovd %ecx, %xmm13
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $32, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm6
+; AVX512F-NEXT: vmovd %ecx, %xmm14
; AVX512F-NEXT: movswl %ax, %ecx
; AVX512F-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
; AVX512F-NEXT: shrl $16, %eax
; AVX512F-NEXT: cwtl
-; AVX512F-NEXT: vmovd %eax, %xmm7
-; AVX512F-NEXT: vmovq %xmm1, %rax
-; AVX512F-NEXT: vmovd %ecx, %xmm8
+; AVX512F-NEXT: vmovd %eax, %xmm15
+; AVX512F-NEXT: vmovq %xmm10, %rax
+; AVX512F-NEXT: vmovd %ecx, %xmm2
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $48, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm9
+; AVX512F-NEXT: vmovd %ecx, %xmm3
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $32, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm10
+; AVX512F-NEXT: vmovd %ecx, %xmm1
; AVX512F-NEXT: movswl %ax, %ecx
; AVX512F-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
; AVX512F-NEXT: shrl $16, %eax
; AVX512F-NEXT: cwtl
-; AVX512F-NEXT: vmovd %eax, %xmm11
-; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512F-NEXT: vmovd %ecx, %xmm1
+; AVX512F-NEXT: vmovd %eax, %xmm4
+; AVX512F-NEXT: vpextrq $1, %xmm10, %rax
+; AVX512F-NEXT: vmovd %ecx, %xmm10
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $48, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm12
+; AVX512F-NEXT: vmovd %ecx, %xmm5
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $32, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm13
+; AVX512F-NEXT: vmovd %ecx, %xmm6
; AVX512F-NEXT: movl %eax, %ecx
; AVX512F-NEXT: shrl $16, %ecx
; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm14
+; AVX512F-NEXT: vmovd %ecx, %xmm7
; AVX512F-NEXT: cwtl
-; AVX512F-NEXT: vmovd %eax, %xmm15
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm16
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
-; AVX512F-NEXT: vcvtph2ps %ymm4, %zmm4
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: vcvtph2ps %ymm5, %zmm5
-; AVX512F-NEXT: vcvtph2ps %ymm6, %zmm6
-; AVX512F-NEXT: vcvtph2ps %ymm7, %zmm7
-; AVX512F-NEXT: vcvtph2ps %ymm8, %zmm8
-; AVX512F-NEXT: vcvtph2ps %ymm9, %zmm9
-; AVX512F-NEXT: vcvtph2ps %ymm10, %zmm10
-; AVX512F-NEXT: vcvtph2ps %ymm11, %zmm11
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512F-NEXT: vcvtph2ps %ymm12, %zmm12
-; AVX512F-NEXT: vcvtph2ps %ymm13, %zmm13
-; AVX512F-NEXT: vcvtph2ps %ymm14, %zmm14
-; AVX512F-NEXT: vcvtph2ps %ymm15, %zmm15
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm15[0],xmm14[0],xmm15[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm13[0],xmm2[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm12[0]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm10[0],xmm1[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm9[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm6[0],xmm2[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[0]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm16[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm8, %xmm8
+; AVX512F-NEXT: vcvtph2ps %xmm9, %xmm9
+; AVX512F-NEXT: vcvtph2ps %xmm11, %xmm11
+; AVX512F-NEXT: vcvtph2ps %xmm12, %xmm12
+; AVX512F-NEXT: vcvtph2ps %xmm13, %xmm13
+; AVX512F-NEXT: vcvtph2ps %xmm14, %xmm14
+; AVX512F-NEXT: vcvtph2ps %xmm15, %xmm15
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512F-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX512F-NEXT: vcvtph2ps %xmm10, %xmm10
+; AVX512F-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX512F-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[2,3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0],xmm0[3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[0]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[2,3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0,1],xmm1[0],xmm4[3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0]
+; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[2,3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm14[0],xmm1[3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[0]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[2,3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
+; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_16i16_to_16f32:
@@ -863,35 +691,12 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind {
;
define float @load_cvt_i16_to_f32(i16* %a0) nounwind {
-; AVX1-LABEL: load_cvt_i16_to_f32:
-; AVX1: # BB#0:
-; AVX1-NEXT: movswl (%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_cvt_i16_to_f32:
-; AVX2: # BB#0:
-; AVX2-NEXT: movswl (%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: load_cvt_i16_to_f32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: movswl (%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: load_cvt_i16_to_f32:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: movswl (%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: load_cvt_i16_to_f32:
+; ALL: # BB#0:
+; ALL-NEXT: movswl (%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: retq
%1 = load i16, i16* %a0
%2 = bitcast i16 %1 to half
%3 = fpext half %2 to float
@@ -899,82 +704,24 @@ define float @load_cvt_i16_to_f32(i16* %a0) nounwind {
}
define <4 x float> @load_cvt_4i16_to_4f32(<4 x i16>* %a0) nounwind {
-; AVX1-LABEL: load_cvt_4i16_to_4f32:
-; AVX1: # BB#0:
-; AVX1-NEXT: movswl 6(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: movswl 4(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm1
-; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX1-NEXT: movswl (%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm2
-; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX1-NEXT: movswl 2(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm3
-; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_cvt_4i16_to_4f32:
-; AVX2: # BB#0:
-; AVX2-NEXT: movswl 6(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: movswl 4(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX2-NEXT: movswl (%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm2
-; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX2-NEXT: movswl 2(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm3
-; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: load_cvt_4i16_to_4f32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: movswl 6(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: movswl 4(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512F-NEXT: movswl (%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
-; AVX512F-NEXT: movswl 2(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: load_cvt_4i16_to_4f32:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: movswl 6(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: movswl 4(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm1
-; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT: movswl (%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm2
-; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512VL-NEXT: movswl 2(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm3
-; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512VL-NEXT: retq
+; ALL-LABEL: load_cvt_4i16_to_4f32:
+; ALL: # BB#0:
+; ALL-NEXT: movswl 6(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: movswl 4(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: movswl (%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: movswl 2(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm3
+; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; ALL-NEXT: retq
%1 = load <4 x i16>, <4 x i16>* %a0
%2 = bitcast <4 x i16> %1 to <4 x half>
%3 = fpext <4 x half> %2 to <4 x float>
@@ -1046,19 +793,18 @@ define <4 x float> @load_cvt_8i16_to_4f32(<8 x i16>* %a0) nounwind {
; AVX512F-NEXT: shrq $48, %rdx
; AVX512F-NEXT: movswl %dx, %edx
; AVX512F-NEXT: vmovd %edx, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: vmovd %esi, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
+; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: load_cvt_8i16_to_4f32:
@@ -1096,145 +842,40 @@ define <4 x float> @load_cvt_8i16_to_4f32(<8 x i16>* %a0) nounwind {
}
define <8 x float> @load_cvt_8i16_to_8f32(<8 x i16>* %a0) nounwind {
-; AVX1-LABEL: load_cvt_8i16_to_8f32:
-; AVX1: # BB#0:
-; AVX1-NEXT: movswl 6(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: movswl 4(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm1
-; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX1-NEXT: movswl (%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm2
-; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX1-NEXT: movswl 2(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm3
-; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX1-NEXT: movswl 14(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm4
-; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX1-NEXT: movswl 12(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm5
-; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX1-NEXT: movswl 8(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm6
-; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX1-NEXT: movswl 10(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm7
-; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_cvt_8i16_to_8f32:
-; AVX2: # BB#0:
-; AVX2-NEXT: movswl 6(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: movswl 4(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX2-NEXT: movswl (%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm2
-; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX2-NEXT: movswl 2(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm3
-; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX2-NEXT: movswl 14(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm4
-; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX2-NEXT: movswl 12(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm5
-; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX2-NEXT: movswl 8(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm6
-; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX2-NEXT: movswl 10(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm7
-; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX2-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: load_cvt_8i16_to_8f32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: movswl 6(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: movswl 4(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512F-NEXT: movswl (%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
-; AVX512F-NEXT: movswl 2(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
-; AVX512F-NEXT: movswl 14(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm4
-; AVX512F-NEXT: vcvtph2ps %ymm4, %zmm4
-; AVX512F-NEXT: movswl 12(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm5
-; AVX512F-NEXT: vcvtph2ps %ymm5, %zmm5
-; AVX512F-NEXT: movswl 8(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm6
-; AVX512F-NEXT: vcvtph2ps %ymm6, %zmm6
-; AVX512F-NEXT: movswl 10(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm7
-; AVX512F-NEXT: vcvtph2ps %ymm7, %zmm7
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: load_cvt_8i16_to_8f32:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: movswl 6(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: movswl 4(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm1
-; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT: movswl (%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm2
-; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512VL-NEXT: movswl 2(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm3
-; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512VL-NEXT: movswl 14(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm4
-; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX512VL-NEXT: movswl 12(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm5
-; AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX512VL-NEXT: movswl 8(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm6
-; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX512VL-NEXT: movswl 10(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm7
-; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: load_cvt_8i16_to_8f32:
+; ALL: # BB#0:
+; ALL-NEXT: movswl 6(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: movswl 4(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: movswl (%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: movswl 2(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm3
+; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT: movswl 14(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm4
+; ALL-NEXT: vcvtph2ps %xmm4, %xmm4
+; ALL-NEXT: movswl 12(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm5
+; ALL-NEXT: vcvtph2ps %xmm5, %xmm5
+; ALL-NEXT: movswl 8(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm6
+; ALL-NEXT: vcvtph2ps %xmm6, %xmm6
+; ALL-NEXT: movswl 10(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm7
+; ALL-NEXT: vcvtph2ps %xmm7, %xmm7
+; ALL-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
+; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; ALL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; ALL-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a0
%2 = bitcast <8 x i16> %1 to <8 x half>
%3 = fpext <8 x half> %2 to <8 x float>
@@ -1378,65 +1019,65 @@ define <16 x float> @load_cvt_16i16_to_16f32(<16 x i16>* %a0) nounwind {
; AVX512F: # BB#0:
; AVX512F-NEXT: movswl 6(%rdi), %eax
; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm16
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm8
; AVX512F-NEXT: movswl 4(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm17
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm9
; AVX512F-NEXT: movswl (%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm10
; AVX512F-NEXT: movswl 2(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm11
; AVX512F-NEXT: movswl 14(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm4
-; AVX512F-NEXT: vcvtph2ps %ymm4, %zmm4
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm12
; AVX512F-NEXT: movswl 12(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm5
-; AVX512F-NEXT: vcvtph2ps %ymm5, %zmm5
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm13
; AVX512F-NEXT: movswl 8(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm6
-; AVX512F-NEXT: vcvtph2ps %ymm6, %zmm6
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm14
; AVX512F-NEXT: movswl 10(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm7
-; AVX512F-NEXT: vcvtph2ps %ymm7, %zmm7
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm15
; AVX512F-NEXT: movswl 22(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm8
-; AVX512F-NEXT: vcvtph2ps %ymm8, %zmm8
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: movswl 20(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm9
-; AVX512F-NEXT: vcvtph2ps %ymm9, %zmm9
+; AVX512F-NEXT: vmovd %eax, %xmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: movswl 16(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm10
-; AVX512F-NEXT: vcvtph2ps %ymm10, %zmm10
+; AVX512F-NEXT: vmovd %eax, %xmm2
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: movswl 18(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm11
-; AVX512F-NEXT: vcvtph2ps %ymm11, %zmm11
+; AVX512F-NEXT: vmovd %eax, %xmm3
+; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: movswl 30(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm12
-; AVX512F-NEXT: vcvtph2ps %ymm12, %zmm12
+; AVX512F-NEXT: vmovd %eax, %xmm4
+; AVX512F-NEXT: vcvtph2ps %xmm4, %xmm4
; AVX512F-NEXT: movswl 28(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm13
-; AVX512F-NEXT: vcvtph2ps %ymm13, %zmm13
+; AVX512F-NEXT: vmovd %eax, %xmm5
+; AVX512F-NEXT: vcvtph2ps %xmm5, %xmm5
; AVX512F-NEXT: movswl 24(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm14
-; AVX512F-NEXT: vcvtph2ps %ymm14, %zmm14
+; AVX512F-NEXT: vmovd %eax, %xmm6
+; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm6
; AVX512F-NEXT: movswl 26(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm15
-; AVX512F-NEXT: vcvtph2ps %ymm15, %zmm15
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm14[0],xmm15[0],xmm14[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm13[0],xmm0[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm12[0]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm10[0],xmm11[0],xmm10[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm9[0],xmm1[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0],xmm1[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; AVX512F-NEXT: vmovd %eax, %xmm7
+; AVX512F-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm17[0],xmm2[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm16[0]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512F-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
@@ -1518,38 +1159,13 @@ define <16 x float> @load_cvt_16i16_to_16f32(<16 x i16>* %a0) nounwind {
;
define double @cvt_i16_to_f64(i16 %a0) nounwind {
-; AVX1-LABEL: cvt_i16_to_f64:
-; AVX1: # BB#0:
-; AVX1-NEXT: movswl %di, %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: cvt_i16_to_f64:
-; AVX2: # BB#0:
-; AVX2-NEXT: movswl %di, %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: cvt_i16_to_f64:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: movswl %di, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_i16_to_f64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: movswl %di, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: cvt_i16_to_f64:
+; ALL: # BB#0:
+; ALL-NEXT: movswl %di, %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: retq
%1 = bitcast i16 %a0 to half
%2 = fpext half %1 to double
ret double %2
@@ -1599,13 +1215,12 @@ define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) nounwind {
; AVX512F-NEXT: shrl $16, %eax
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: vmovd %ecx, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_2i16_to_2f64:
@@ -1701,15 +1316,15 @@ define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) nounwind {
; AVX512F-NEXT: shrl $16, %edx
; AVX512F-NEXT: movswl %dx, %edx
; AVX512F-NEXT: vmovd %edx, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: vmovd %esi, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
+; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
@@ -1791,13 +1406,12 @@ define <2 x double> @cvt_8i16_to_2f64(<8 x i16> %a0) nounwind {
; AVX512F-NEXT: shrl $16, %eax
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: vmovd %ecx, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_8i16_to_2f64:
@@ -1892,15 +1506,15 @@ define <4 x double> @cvt_8i16_to_4f64(<8 x i16> %a0) nounwind {
; AVX512F-NEXT: shrl $16, %edx
; AVX512F-NEXT: movswl %dx, %edx
; AVX512F-NEXT: vmovd %edx, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: vmovd %esi, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
+; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
@@ -1950,25 +1564,25 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_8i16_to_8f64:
; AVX1: # BB#0:
; AVX1-NEXT: vmovq %xmm0, %rdx
-; AVX1-NEXT: movq %rdx, %r8
+; AVX1-NEXT: movq %rdx, %r9
; AVX1-NEXT: movl %edx, %r10d
-; AVX1-NEXT: movswl %dx, %r9d
+; AVX1-NEXT: movswl %dx, %r8d
; AVX1-NEXT: shrq $48, %rdx
-; AVX1-NEXT: shrq $32, %r8
+; AVX1-NEXT: shrq $32, %r9
; AVX1-NEXT: shrl $16, %r10d
; AVX1-NEXT: vpextrq $1, %xmm0, %rdi
-; AVX1-NEXT: movq %rdi, %rax
-; AVX1-NEXT: movl %edi, %esi
+; AVX1-NEXT: movq %rdi, %rsi
+; AVX1-NEXT: movl %edi, %eax
; AVX1-NEXT: movswl %di, %ecx
; AVX1-NEXT: shrq $48, %rdi
-; AVX1-NEXT: shrq $32, %rax
-; AVX1-NEXT: shrl $16, %esi
-; AVX1-NEXT: movswl %si, %esi
-; AVX1-NEXT: vmovd %esi, %xmm0
+; AVX1-NEXT: shrq $32, %rsi
+; AVX1-NEXT: shrl $16, %eax
+; AVX1-NEXT: cwtl
+; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm1
; AVX1-NEXT: vmovd %ecx, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm2
-; AVX1-NEXT: cwtl
+; AVX1-NEXT: movswl %si, %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm3
; AVX1-NEXT: movswl %di, %eax
@@ -1977,9 +1591,9 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind {
; AVX1-NEXT: movswl %r10w, %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: vmovd %r9d, %xmm5
+; AVX1-NEXT: vmovd %r8d, %xmm5
; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX1-NEXT: movswl %r8w, %eax
+; AVX1-NEXT: movswl %r9w, %eax
; AVX1-NEXT: vmovd %eax, %xmm6
; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
; AVX1-NEXT: movswl %dx, %eax
@@ -2004,25 +1618,25 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind {
; AVX2-LABEL: cvt_8i16_to_8f64:
; AVX2: # BB#0:
; AVX2-NEXT: vmovq %xmm0, %rdx
-; AVX2-NEXT: movq %rdx, %r8
+; AVX2-NEXT: movq %rdx, %r9
; AVX2-NEXT: movl %edx, %r10d
-; AVX2-NEXT: movswl %dx, %r9d
+; AVX2-NEXT: movswl %dx, %r8d
; AVX2-NEXT: shrq $48, %rdx
-; AVX2-NEXT: shrq $32, %r8
+; AVX2-NEXT: shrq $32, %r9
; AVX2-NEXT: shrl $16, %r10d
; AVX2-NEXT: vpextrq $1, %xmm0, %rdi
-; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: movl %edi, %esi
+; AVX2-NEXT: movq %rdi, %rsi
+; AVX2-NEXT: movl %edi, %eax
; AVX2-NEXT: movswl %di, %ecx
; AVX2-NEXT: shrq $48, %rdi
-; AVX2-NEXT: shrq $32, %rax
-; AVX2-NEXT: shrl $16, %esi
-; AVX2-NEXT: movswl %si, %esi
-; AVX2-NEXT: vmovd %esi, %xmm0
+; AVX2-NEXT: shrq $32, %rsi
+; AVX2-NEXT: shrl $16, %eax
+; AVX2-NEXT: cwtl
+; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm1
; AVX2-NEXT: vmovd %ecx, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm2
-; AVX2-NEXT: cwtl
+; AVX2-NEXT: movswl %si, %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm3
; AVX2-NEXT: movswl %di, %eax
@@ -2031,9 +1645,9 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind {
; AVX2-NEXT: movswl %r10w, %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: vmovd %r9d, %xmm5
+; AVX2-NEXT: vmovd %r8d, %xmm5
; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX2-NEXT: movswl %r8w, %eax
+; AVX2-NEXT: movswl %r9w, %eax
; AVX2-NEXT: vmovd %eax, %xmm6
; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
; AVX2-NEXT: movswl %dx, %eax
@@ -2055,115 +1669,60 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind {
; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: cvt_8i16_to_8f64:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX512F-NEXT: movq %rdx, %r8
-; AVX512F-NEXT: movl %edx, %r9d
-; AVX512F-NEXT: movswl %dx, %r10d
-; AVX512F-NEXT: shrq $48, %rdx
-; AVX512F-NEXT: shrq $32, %r8
-; AVX512F-NEXT: shrl $16, %r9d
-; AVX512F-NEXT: vmovq %xmm0, %rdi
-; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: movl %edi, %ecx
-; AVX512F-NEXT: movswl %di, %esi
-; AVX512F-NEXT: shrq $48, %rdi
-; AVX512F-NEXT: shrq $32, %rax
-; AVX512F-NEXT: shrl $16, %ecx
-; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: vmovd %esi, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512F-NEXT: cwtl
-; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
-; AVX512F-NEXT: movswl %di, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
-; AVX512F-NEXT: movswl %r9w, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm4
-; AVX512F-NEXT: vcvtph2ps %ymm4, %zmm4
-; AVX512F-NEXT: vmovd %r10d, %xmm5
-; AVX512F-NEXT: vcvtph2ps %ymm5, %zmm5
-; AVX512F-NEXT: movswl %r8w, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm6
-; AVX512F-NEXT: vcvtph2ps %ymm6, %zmm6
-; AVX512F-NEXT: movswl %dx, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm7
-; AVX512F-NEXT: vcvtph2ps %ymm7, %zmm7
-; AVX512F-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
-; AVX512F-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; AVX512F-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm4 = xmm5[0],xmm4[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
-; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_8i16_to_8f64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX512VL-NEXT: movq %rdx, %r8
-; AVX512VL-NEXT: movl %edx, %r10d
-; AVX512VL-NEXT: movswl %dx, %r9d
-; AVX512VL-NEXT: shrq $48, %rdx
-; AVX512VL-NEXT: shrq $32, %r8
-; AVX512VL-NEXT: shrl $16, %r10d
-; AVX512VL-NEXT: vmovq %xmm0, %rdi
-; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movl %edi, %esi
-; AVX512VL-NEXT: movswl %di, %ecx
-; AVX512VL-NEXT: shrq $48, %rdi
-; AVX512VL-NEXT: shrq $32, %rax
-; AVX512VL-NEXT: shrl $16, %esi
-; AVX512VL-NEXT: movswl %si, %esi
-; AVX512VL-NEXT: vmovd %esi, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovd %ecx, %xmm1
-; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT: cwtl
-; AVX512VL-NEXT: vmovd %eax, %xmm2
-; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512VL-NEXT: movswl %di, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm3
-; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512VL-NEXT: movswl %r10w, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm4
-; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX512VL-NEXT: vmovd %r9d, %xmm5
-; AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX512VL-NEXT: movswl %r8w, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm6
-; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX512VL-NEXT: movswl %dx, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm7
-; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX512VL-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
-; AVX512VL-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; AVX512VL-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
-; AVX512VL-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm4 = xmm5[0],xmm4[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
-; AVX512VL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: cvt_8i16_to_8f64:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512-NEXT: movq %rdx, %r9
+; AVX512-NEXT: movl %edx, %r10d
+; AVX512-NEXT: movswl %dx, %r8d
+; AVX512-NEXT: shrq $48, %rdx
+; AVX512-NEXT: shrq $32, %r9
+; AVX512-NEXT: shrl $16, %r10d
+; AVX512-NEXT: vmovq %xmm0, %rdi
+; AVX512-NEXT: movq %rdi, %rsi
+; AVX512-NEXT: movl %edi, %eax
+; AVX512-NEXT: movswl %di, %ecx
+; AVX512-NEXT: shrq $48, %rdi
+; AVX512-NEXT: shrq $32, %rsi
+; AVX512-NEXT: shrl $16, %eax
+; AVX512-NEXT: cwtl
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovd %ecx, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: movswl %si, %eax
+; AVX512-NEXT: vmovd %eax, %xmm2
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: movswl %di, %eax
+; AVX512-NEXT: vmovd %eax, %xmm3
+; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512-NEXT: movswl %r10w, %eax
+; AVX512-NEXT: vmovd %eax, %xmm4
+; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX512-NEXT: vmovd %r8d, %xmm5
+; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX512-NEXT: movswl %r9w, %eax
+; AVX512-NEXT: vmovd %eax, %xmm6
+; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX512-NEXT: movswl %dx, %eax
+; AVX512-NEXT: vmovd %eax, %xmm7
+; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX512-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
+; AVX512-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
+; AVX512-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
+; AVX512-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm4 = xmm5[0],xmm4[0]
+; AVX512-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
+; AVX512-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512-NEXT: retq
%1 = bitcast <8 x i16> %a0 to <8 x half>
%2 = fpext <8 x half> %1 to <8 x double>
ret <8 x double> %2
@@ -2174,38 +1733,13 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind {
;
define double @load_cvt_i16_to_f64(i16* %a0) nounwind {
-; AVX1-LABEL: load_cvt_i16_to_f64:
-; AVX1: # BB#0:
-; AVX1-NEXT: movswl (%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_cvt_i16_to_f64:
-; AVX2: # BB#0:
-; AVX2-NEXT: movswl (%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: load_cvt_i16_to_f64:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: movswl (%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: load_cvt_i16_to_f64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: movswl (%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: load_cvt_i16_to_f64:
+; ALL: # BB#0:
+; ALL-NEXT: movswl (%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: retq
%1 = load i16, i16* %a0
%2 = bitcast i16 %1 to half
%3 = fpext half %2 to double
@@ -2213,58 +1747,18 @@ define double @load_cvt_i16_to_f64(i16* %a0) nounwind {
}
define <2 x double> @load_cvt_2i16_to_2f64(<2 x i16>* %a0) nounwind {
-; AVX1-LABEL: load_cvt_2i16_to_2f64:
-; AVX1: # BB#0:
-; AVX1-NEXT: movswl (%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: movswl 2(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm1
-; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_cvt_2i16_to_2f64:
-; AVX2: # BB#0:
-; AVX2-NEXT: movswl (%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: movswl 2(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: load_cvt_2i16_to_2f64:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: movswl (%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: movswl 2(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: load_cvt_2i16_to_2f64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: movswl (%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: movswl 2(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm1
-; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: retq
+; ALL-LABEL: load_cvt_2i16_to_2f64:
+; ALL: # BB#0:
+; ALL-NEXT: movswl (%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: movswl 2(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; ALL-NEXT: retq
%1 = load <2 x i16>, <2 x i16>* %a0
%2 = bitcast <2 x i16> %1 to <2 x half>
%3 = fpext <2 x half> %2 to <2 x double>
@@ -2272,97 +1766,28 @@ define <2 x double> @load_cvt_2i16_to_2f64(<2 x i16>* %a0) nounwind {
}
define <4 x double> @load_cvt_4i16_to_4f64(<4 x i16>* %a0) nounwind {
-; AVX1-LABEL: load_cvt_4i16_to_4f64:
-; AVX1: # BB#0:
-; AVX1-NEXT: movswl (%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: movswl 2(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm1
-; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX1-NEXT: movswl 4(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm2
-; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX1-NEXT: movswl 6(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm3
-; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_cvt_4i16_to_4f64:
-; AVX2: # BB#0:
-; AVX2-NEXT: movswl (%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: movswl 2(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX2-NEXT: movswl 4(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm2
-; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX2-NEXT: movswl 6(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm3
-; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: load_cvt_4i16_to_4f64:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: movswl (%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: movswl 2(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512F-NEXT: movswl 4(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
-; AVX512F-NEXT: movswl 6(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
-; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: load_cvt_4i16_to_4f64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: movswl (%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: movswl 2(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm1
-; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT: movswl 4(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm2
-; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512VL-NEXT: movswl 6(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm3
-; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512VL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: load_cvt_4i16_to_4f64:
+; ALL: # BB#0:
+; ALL-NEXT: movswl (%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: movswl 2(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: movswl 4(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: movswl 6(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm3
+; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; ALL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; ALL-NEXT: retq
%1 = load <4 x i16>, <4 x i16>* %a0
%2 = bitcast <4 x i16> %1 to <4 x half>
%3 = fpext <4 x half> %2 to <4 x double>
@@ -2439,15 +1864,15 @@ define <4 x double> @load_cvt_8i16_to_4f64(<8 x i16>* %a0) nounwind {
; AVX512F-NEXT: shrl $16, %edx
; AVX512F-NEXT: movswl %dx, %edx
; AVX512F-NEXT: vmovd %edx, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: vmovd %esi, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
+; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
@@ -2579,91 +2004,48 @@ define <8 x double> @load_cvt_8i16_to_8f64(<8 x i16>* %a0) nounwind {
; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: load_cvt_8i16_to_8f64:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: movswl (%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: movswl 2(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512F-NEXT: movswl 4(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
-; AVX512F-NEXT: movswl 6(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
-; AVX512F-NEXT: movswl 8(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm4
-; AVX512F-NEXT: vcvtph2ps %ymm4, %zmm4
-; AVX512F-NEXT: movswl 10(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm5
-; AVX512F-NEXT: vcvtph2ps %ymm5, %zmm5
-; AVX512F-NEXT: movswl 12(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm6
-; AVX512F-NEXT: vcvtph2ps %ymm6, %zmm6
-; AVX512F-NEXT: movswl 14(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm7
-; AVX512F-NEXT: vcvtph2ps %ymm7, %zmm7
-; AVX512F-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
-; AVX512F-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; AVX512F-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
-; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: load_cvt_8i16_to_8f64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: movswl (%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: movswl 2(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm1
-; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT: movswl 4(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm2
-; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512VL-NEXT: movswl 6(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm3
-; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512VL-NEXT: movswl 8(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm4
-; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX512VL-NEXT: movswl 10(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm5
-; AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX512VL-NEXT: movswl 12(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm6
-; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX512VL-NEXT: movswl 14(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm7
-; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX512VL-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
-; AVX512VL-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; AVX512VL-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
-; AVX512VL-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
-; AVX512VL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: load_cvt_8i16_to_8f64:
+; AVX512: # BB#0:
+; AVX512-NEXT: movswl (%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: movswl 2(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: movswl 4(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm2
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: movswl 6(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm3
+; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512-NEXT: movswl 8(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm4
+; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX512-NEXT: movswl 10(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm5
+; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX512-NEXT: movswl 12(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm6
+; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX512-NEXT: movswl 14(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm7
+; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX512-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
+; AVX512-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
+; AVX512-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
+; AVX512-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; AVX512-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
+; AVX512-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a0
%2 = bitcast <8 x i16> %1 to <8 x half>
%3 = fpext <8 x half> %2 to <8 x double>
@@ -2675,138 +2057,41 @@ define <8 x double> @load_cvt_8i16_to_8f64(<8 x i16>* %a0) nounwind {
;
define i16 @cvt_f32_to_i16(float %a0) nounwind {
-; AVX1-LABEL: cvt_f32_to_i16:
-; AVX1: # BB#0:
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: cvt_f32_to_i16:
-; AVX2: # BB#0:
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: cvt_f32_to_i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_f32_to_i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; AVX512VL-NEXT: retq
+; ALL-LABEL: cvt_f32_to_i16:
+; ALL: # BB#0:
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %eax
+; ALL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; ALL-NEXT: retq
%1 = fptrunc float %a0 to half
%2 = bitcast half %1 to i16
ret i16 %2
}
define <4 x i16> @cvt_4f32_to_4i16(<4 x float> %a0) nounwind {
-; AVX1-LABEL: cvt_4f32_to_4i16:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %eax
-; AVX1-NEXT: shll $16, %eax
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: movzwl %cx, %ecx
-; AVX1-NEXT: orl %eax, %ecx
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %eax
-; AVX1-NEXT: shll $16, %eax
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %edx
-; AVX1-NEXT: movzwl %dx, %edx
-; AVX1-NEXT: orl %eax, %edx
-; AVX1-NEXT: shlq $32, %rdx
-; AVX1-NEXT: orq %rcx, %rdx
-; AVX1-NEXT: vmovq %rdx, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: cvt_4f32_to_4i16:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %eax
-; AVX2-NEXT: shll $16, %eax
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %ecx
-; AVX2-NEXT: movzwl %cx, %ecx
-; AVX2-NEXT: orl %eax, %ecx
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %eax
-; AVX2-NEXT: shll $16, %eax
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %edx
-; AVX2-NEXT: movzwl %dx, %edx
-; AVX2-NEXT: orl %eax, %edx
-; AVX2-NEXT: shlq $32, %rdx
-; AVX2-NEXT: orq %rcx, %rdx
-; AVX2-NEXT: vmovq %rdx, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: cvt_4f32_to_4i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: shll $16, %ecx
-; AVX512F-NEXT: orl %eax, %ecx
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vmovd %xmm0, %edx
-; AVX512F-NEXT: shll $16, %edx
-; AVX512F-NEXT: orl %eax, %edx
-; AVX512F-NEXT: shlq $32, %rdx
-; AVX512F-NEXT: orq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_4f32_to_4i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: shll $16, %eax
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %ecx
-; AVX512VL-NEXT: movzwl %cx, %ecx
-; AVX512VL-NEXT: orl %eax, %ecx
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: shll $16, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %edx
-; AVX512VL-NEXT: movzwl %dx, %edx
-; AVX512VL-NEXT: orl %eax, %edx
-; AVX512VL-NEXT: shlq $32, %rdx
-; AVX512VL-NEXT: orq %rcx, %rdx
-; AVX512VL-NEXT: vmovq %rdx, %xmm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: cvt_4f32_to_4i16:
+; ALL: # BB#0:
+; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; ALL-NEXT: vmovd %xmm1, %ecx
+; ALL-NEXT: movzwl %cx, %ecx
+; ALL-NEXT: orl %eax, %ecx
+; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %edx
+; ALL-NEXT: movzwl %dx, %edx
+; ALL-NEXT: orl %eax, %edx
+; ALL-NEXT: shlq $32, %rdx
+; ALL-NEXT: orq %rcx, %rdx
+; ALL-NEXT: vmovq %rdx, %xmm0
+; ALL-NEXT: retq
%1 = fptrunc <4 x float> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
ret <4 x i16> %2
@@ -2865,29 +2150,27 @@ define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) nounwind {
;
; AVX512F-LABEL: cvt_4f32_to_8i16_undef:
; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
+; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: shll $16, %eax
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm1
; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: shll $16, %ecx
+; AVX512F-NEXT: movzwl %cx, %ecx
; AVX512F-NEXT: orl %eax, %ecx
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
+; AVX512F-NEXT: shll $16, %eax
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, %edx
-; AVX512F-NEXT: shll $16, %edx
+; AVX512F-NEXT: movzwl %dx, %edx
; AVX512F-NEXT: orl %eax, %edx
; AVX512F-NEXT: shlq $32, %rdx
; AVX512F-NEXT: orq %rcx, %rdx
; AVX512F-NEXT: vmovq %rdx, %xmm0
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_4f32_to_8i16_undef:
@@ -2974,29 +2257,27 @@ define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind {
;
; AVX512F-LABEL: cvt_4f32_to_8i16_zero:
; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
+; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: shll $16, %eax
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm1
; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: shll $16, %ecx
+; AVX512F-NEXT: movzwl %cx, %ecx
; AVX512F-NEXT: orl %eax, %ecx
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
+; AVX512F-NEXT: shll $16, %eax
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, %edx
-; AVX512F-NEXT: shll $16, %edx
+; AVX512F-NEXT: movzwl %dx, %edx
; AVX512F-NEXT: orl %eax, %edx
; AVX512F-NEXT: shlq $32, %rdx
; AVX512F-NEXT: orq %rcx, %rdx
; AVX512F-NEXT: vmovq %rdx, %xmm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_4f32_to_8i16_zero:
@@ -3033,194 +2314,52 @@ define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind {
}
define <8 x i16> @cvt_8f32_to_8i16(<8 x float> %a0) nounwind {
-; AVX1-LABEL: cvt_8f32_to_8i16:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %eax
-; AVX1-NEXT: shll $16, %eax
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: movzwl %cx, %ecx
-; AVX1-NEXT: orl %eax, %ecx
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %edx
-; AVX1-NEXT: shll $16, %edx
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %eax
-; AVX1-NEXT: movzwl %ax, %eax
-; AVX1-NEXT: orl %edx, %eax
-; AVX1-NEXT: shlq $32, %rax
-; AVX1-NEXT: orq %rcx, %rax
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: shll $16, %ecx
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %edx
-; AVX1-NEXT: movzwl %dx, %edx
-; AVX1-NEXT: orl %ecx, %edx
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: shll $16, %ecx
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %esi
-; AVX1-NEXT: movzwl %si, %esi
-; AVX1-NEXT: orl %ecx, %esi
-; AVX1-NEXT: shlq $32, %rsi
-; AVX1-NEXT: orq %rdx, %rsi
-; AVX1-NEXT: vmovq %rsi, %xmm0
-; AVX1-NEXT: vmovq %rax, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: cvt_8f32_to_8i16:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %eax
-; AVX2-NEXT: shll $16, %eax
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %ecx
-; AVX2-NEXT: movzwl %cx, %ecx
-; AVX2-NEXT: orl %eax, %ecx
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %edx
-; AVX2-NEXT: shll $16, %edx
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %eax
-; AVX2-NEXT: movzwl %ax, %eax
-; AVX2-NEXT: orl %edx, %eax
-; AVX2-NEXT: shlq $32, %rax
-; AVX2-NEXT: orq %rcx, %rax
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %ecx
-; AVX2-NEXT: shll $16, %ecx
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %edx
-; AVX2-NEXT: movzwl %dx, %edx
-; AVX2-NEXT: orl %ecx, %edx
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %ecx
-; AVX2-NEXT: shll $16, %ecx
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %esi
-; AVX2-NEXT: movzwl %si, %esi
-; AVX2-NEXT: orl %ecx, %esi
-; AVX2-NEXT: shlq $32, %rsi
-; AVX2-NEXT: orq %rdx, %rsi
-; AVX2-NEXT: vmovq %rsi, %xmm0
-; AVX2-NEXT: vmovq %rax, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: cvt_8f32_to_8i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: shll $16, %ecx
-; AVX512F-NEXT: orl %eax, %ecx
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %edx
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: shll $16, %eax
-; AVX512F-NEXT: orl %edx, %eax
-; AVX512F-NEXT: shlq $32, %rax
-; AVX512F-NEXT: orq %rcx, %rax
-; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: movzwl %cx, %ecx
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %edx
-; AVX512F-NEXT: shll $16, %edx
-; AVX512F-NEXT: orl %ecx, %edx
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: movzwl %cx, %ecx
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vmovd %xmm0, %esi
-; AVX512F-NEXT: shll $16, %esi
-; AVX512F-NEXT: orl %ecx, %esi
-; AVX512F-NEXT: shlq $32, %rsi
-; AVX512F-NEXT: orq %rdx, %rsi
-; AVX512F-NEXT: vmovq %rsi, %xmm0
-; AVX512F-NEXT: vmovq %rax, %xmm1
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_8f32_to_8i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: shll $16, %eax
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %ecx
-; AVX512VL-NEXT: movzwl %cx, %ecx
-; AVX512VL-NEXT: orl %eax, %ecx
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %edx
-; AVX512VL-NEXT: shll $16, %edx
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: movzwl %ax, %eax
-; AVX512VL-NEXT: orl %edx, %eax
-; AVX512VL-NEXT: shlq $32, %rax
-; AVX512VL-NEXT: orq %rcx, %rax
-; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %ecx
-; AVX512VL-NEXT: shll $16, %ecx
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %edx
-; AVX512VL-NEXT: movzwl %dx, %edx
-; AVX512VL-NEXT: orl %ecx, %edx
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %ecx
-; AVX512VL-NEXT: shll $16, %ecx
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %esi
-; AVX512VL-NEXT: movzwl %si, %esi
-; AVX512VL-NEXT: orl %ecx, %esi
-; AVX512VL-NEXT: shlq $32, %rsi
-; AVX512VL-NEXT: orq %rdx, %rsi
-; AVX512VL-NEXT: vmovq %rsi, %xmm0
-; AVX512VL-NEXT: vmovq %rax, %xmm1
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; ALL-LABEL: cvt_8f32_to_8i16:
+; ALL: # BB#0:
+; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; ALL-NEXT: vmovd %xmm1, %ecx
+; ALL-NEXT: movzwl %cx, %ecx
+; ALL-NEXT: orl %eax, %ecx
+; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %edx
+; ALL-NEXT: shll $16, %edx
+; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: movzwl %ax, %eax
+; ALL-NEXT: orl %edx, %eax
+; ALL-NEXT: shlq $32, %rax
+; ALL-NEXT: orq %rcx, %rax
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
+; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %ecx
+; ALL-NEXT: shll $16, %ecx
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; ALL-NEXT: vmovd %xmm1, %edx
+; ALL-NEXT: movzwl %dx, %edx
+; ALL-NEXT: orl %ecx, %edx
+; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %ecx
+; ALL-NEXT: shll $16, %ecx
+; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %esi
+; ALL-NEXT: movzwl %si, %esi
+; ALL-NEXT: orl %ecx, %esi
+; ALL-NEXT: shlq $32, %rsi
+; ALL-NEXT: orq %rdx, %rsi
+; ALL-NEXT: vmovq %rsi, %xmm0
+; ALL-NEXT: vmovq %rax, %xmm1
+; ALL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; ALL-NEXT: vzeroupper
+; ALL-NEXT: retq
%1 = fptrunc <8 x float> %a0 to <8 x half>
%2 = bitcast <8 x half> %1 to <8 x i16>
ret <8 x i16> %2
@@ -3361,141 +2500,73 @@ define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind {
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: cvt_16f32_to_16i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm2
-; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm1
-; AVX512F-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm0
-; AVX512F-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_16f32_to_16i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm2
-; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovd %eax, %xmm3
-; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512VL-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm1
-; AVX512VL-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512VL-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %eax, %xmm3
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm0
-; AVX512VL-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
-; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: cvt_16f32_to_16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm2
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: vmovd %eax, %xmm3
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm1
+; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vmovd %eax, %xmm3
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm0
+; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512-NEXT: retq
%1 = fptrunc <16 x float> %a0 to <16 x half>
%2 = bitcast <16 x half> %1 to <16 x i16>
ret <16 x i16> %2
@@ -3506,35 +2577,12 @@ define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind {
;
define void @store_cvt_f32_to_i16(float %a0, i16* %a1) nounwind {
-; AVX1-LABEL: store_cvt_f32_to_i16:
-; AVX1: # BB#0:
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: movw %ax, (%rdi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: store_cvt_f32_to_i16:
-; AVX2: # BB#0:
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: movw %ax, (%rdi)
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: store_cvt_f32_to_i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: movw %ax, (%rdi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: store_cvt_f32_to_i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: movw %ax, (%rdi)
-; AVX512VL-NEXT: retq
+; ALL-LABEL: store_cvt_f32_to_i16:
+; ALL: # BB#0:
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %eax
+; ALL-NEXT: movw %ax, (%rdi)
+; ALL-NEXT: retq
%1 = fptrunc float %a0 to half
%2 = bitcast half %1 to i16
store i16 %2, i16* %a1
@@ -3542,83 +2590,24 @@ define void @store_cvt_f32_to_i16(float %a0, i16* %a1) nounwind {
}
define void @store_cvt_4f32_to_4i16(<4 x float> %a0, <4 x i16>* %a1) nounwind {
-; AVX1-LABEL: store_cvt_4f32_to_4i16:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %eax
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %edx
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %esi
-; AVX1-NEXT: movw %si, (%rdi)
-; AVX1-NEXT: movw %dx, 6(%rdi)
-; AVX1-NEXT: movw %cx, 4(%rdi)
-; AVX1-NEXT: movw %ax, 2(%rdi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: store_cvt_4f32_to_4i16:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %eax
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %ecx
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %edx
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %esi
-; AVX2-NEXT: movw %si, (%rdi)
-; AVX2-NEXT: movw %dx, 6(%rdi)
-; AVX2-NEXT: movw %cx, 4(%rdi)
-; AVX2-NEXT: movw %ax, 2(%rdi)
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: store_cvt_4f32_to_4i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %edx
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vmovd %xmm0, %esi
-; AVX512F-NEXT: movw %si, (%rdi)
-; AVX512F-NEXT: movw %dx, 6(%rdi)
-; AVX512F-NEXT: movw %cx, 4(%rdi)
-; AVX512F-NEXT: movw %ax, 2(%rdi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: store_cvt_4f32_to_4i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %ecx
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %edx
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %esi
-; AVX512VL-NEXT: movw %si, (%rdi)
-; AVX512VL-NEXT: movw %dx, 6(%rdi)
-; AVX512VL-NEXT: movw %cx, 4(%rdi)
-; AVX512VL-NEXT: movw %ax, 2(%rdi)
-; AVX512VL-NEXT: retq
+; ALL-LABEL: store_cvt_4f32_to_4i16:
+; ALL: # BB#0:
+; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %ecx
+; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %edx
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %esi
+; ALL-NEXT: movw %si, (%rdi)
+; ALL-NEXT: movw %dx, 6(%rdi)
+; ALL-NEXT: movw %cx, 4(%rdi)
+; ALL-NEXT: movw %ax, 2(%rdi)
+; ALL-NEXT: retq
%1 = fptrunc <4 x float> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
store <4 x i16> %2, <4 x i16>* %a1
@@ -3680,30 +2669,28 @@ define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) nounw
;
; AVX512F-LABEL: store_cvt_4f32_to_8i16_undef:
; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
+; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: shll $16, %eax
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm1
; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: shll $16, %ecx
+; AVX512F-NEXT: movzwl %cx, %ecx
; AVX512F-NEXT: orl %eax, %ecx
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
+; AVX512F-NEXT: shll $16, %eax
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, %edx
-; AVX512F-NEXT: shll $16, %edx
+; AVX512F-NEXT: movzwl %dx, %edx
; AVX512F-NEXT: orl %eax, %edx
; AVX512F-NEXT: shlq $32, %rdx
; AVX512F-NEXT: orq %rcx, %rdx
; AVX512F-NEXT: vmovq %rdx, %xmm0
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512F-NEXT: vmovdqa %xmm0, (%rdi)
-; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: store_cvt_4f32_to_8i16_undef:
@@ -3794,30 +2781,28 @@ define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) nounwi
;
; AVX512F-LABEL: store_cvt_4f32_to_8i16_zero:
; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
+; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: shll $16, %eax
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm1
; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: shll $16, %ecx
+; AVX512F-NEXT: movzwl %cx, %ecx
; AVX512F-NEXT: orl %eax, %ecx
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
+; AVX512F-NEXT: shll $16, %eax
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, %edx
-; AVX512F-NEXT: shll $16, %edx
+; AVX512F-NEXT: movzwl %dx, %edx
; AVX512F-NEXT: orl %eax, %edx
; AVX512F-NEXT: shlq $32, %rdx
; AVX512F-NEXT: orq %rcx, %rdx
; AVX512F-NEXT: vmovq %rdx, %xmm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vmovdqa %xmm0, (%rdi)
-; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: store_cvt_4f32_to_8i16_zero:
@@ -3856,150 +2841,41 @@ define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) nounwi
}
define void @store_cvt_8f32_to_8i16(<8 x float> %a0, <8 x i16>* %a1) nounwind {
-; AVX1-LABEL: store_cvt_8f32_to_8i16:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %r8d
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %r9d
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %r10d
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX1-NEXT: vmovd %xmm2, %r11d
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX1-NEXT: vmovd %xmm2, %eax
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX1-NEXT: vmovd %xmm2, %ecx
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %edx
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %esi
-; AVX1-NEXT: movw %si, 8(%rdi)
-; AVX1-NEXT: movw %dx, (%rdi)
-; AVX1-NEXT: movw %cx, 14(%rdi)
-; AVX1-NEXT: movw %ax, 12(%rdi)
-; AVX1-NEXT: movw %r11w, 10(%rdi)
-; AVX1-NEXT: movw %r10w, 6(%rdi)
-; AVX1-NEXT: movw %r9w, 4(%rdi)
-; AVX1-NEXT: movw %r8w, 2(%rdi)
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: store_cvt_8f32_to_8i16:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %r8d
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %r9d
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %r10d
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX2-NEXT: vmovd %xmm2, %r11d
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX2-NEXT: vmovd %xmm2, %eax
-; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX2-NEXT: vmovd %xmm2, %ecx
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %edx
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %esi
-; AVX2-NEXT: movw %si, 8(%rdi)
-; AVX2-NEXT: movw %dx, (%rdi)
-; AVX2-NEXT: movw %cx, 14(%rdi)
-; AVX2-NEXT: movw %ax, 12(%rdi)
-; AVX2-NEXT: movw %r11w, 10(%rdi)
-; AVX2-NEXT: movw %r10w, 6(%rdi)
-; AVX2-NEXT: movw %r9w, 4(%rdi)
-; AVX2-NEXT: movw %r8w, 2(%rdi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: store_cvt_8f32_to_8i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %r8d
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %r9d
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %r10d
-; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: vmovd %xmm2, %r11d
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: vmovd %xmm2, %ecx
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vmovd %xmm0, %edx
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm0
-; AVX512F-NEXT: vmovd %xmm0, %esi
-; AVX512F-NEXT: movw %si, 8(%rdi)
-; AVX512F-NEXT: movw %dx, (%rdi)
-; AVX512F-NEXT: movw %cx, 14(%rdi)
-; AVX512F-NEXT: movw %ax, 12(%rdi)
-; AVX512F-NEXT: movw %r11w, 10(%rdi)
-; AVX512F-NEXT: movw %r10w, 6(%rdi)
-; AVX512F-NEXT: movw %r9w, 4(%rdi)
-; AVX512F-NEXT: movw %r8w, 2(%rdi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: store_cvt_8f32_to_8i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %r8d
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %r9d
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %r10d
-; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovd %xmm2, %r11d
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovd %xmm2, %ecx
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %edx
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %esi
-; AVX512VL-NEXT: movw %si, 8(%rdi)
-; AVX512VL-NEXT: movw %dx, (%rdi)
-; AVX512VL-NEXT: movw %cx, 14(%rdi)
-; AVX512VL-NEXT: movw %ax, 12(%rdi)
-; AVX512VL-NEXT: movw %r11w, 10(%rdi)
-; AVX512VL-NEXT: movw %r10w, 6(%rdi)
-; AVX512VL-NEXT: movw %r9w, 4(%rdi)
-; AVX512VL-NEXT: movw %r8w, 2(%rdi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; ALL-LABEL: store_cvt_8f32_to_8i16:
+; ALL: # BB#0:
+; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %r8d
+; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %r9d
+; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %r10d
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1
+; ALL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; ALL-NEXT: vmovd %xmm2, %r11d
+; ALL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; ALL-NEXT: vmovd %xmm2, %eax
+; ALL-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; ALL-NEXT: vmovd %xmm2, %ecx
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %edx
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm0
+; ALL-NEXT: vmovd %xmm0, %esi
+; ALL-NEXT: movw %si, 8(%rdi)
+; ALL-NEXT: movw %dx, (%rdi)
+; ALL-NEXT: movw %cx, 14(%rdi)
+; ALL-NEXT: movw %ax, 12(%rdi)
+; ALL-NEXT: movw %r11w, 10(%rdi)
+; ALL-NEXT: movw %r10w, 6(%rdi)
+; ALL-NEXT: movw %r9w, 4(%rdi)
+; ALL-NEXT: movw %r8w, 2(%rdi)
+; ALL-NEXT: vzeroupper
+; ALL-NEXT: retq
%1 = fptrunc <8 x float> %a0 to <8 x half>
%2 = bitcast <8 x half> %1 to <8 x i16>
store <8 x i16> %2, <8 x i16>* %a1
@@ -4141,141 +3017,73 @@ define void @store_cvt_16f32_to_16i16(<16 x float> %a0, <16 x i16>* %a1) nounwin
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: store_cvt_16f32_to_16i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX512F-NEXT: vcvtps2ph $4, %zmm3, %ymm4
-; AVX512F-NEXT: vmovd %xmm4, %eax
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm4
-; AVX512F-NEXT: movw %ax, 24(%rdi)
-; AVX512F-NEXT: vmovd %xmm4, %eax
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm4
-; AVX512F-NEXT: movw %ax, 16(%rdi)
-; AVX512F-NEXT: vmovd %xmm4, %eax
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm4
-; AVX512F-NEXT: movw %ax, 8(%rdi)
-; AVX512F-NEXT: vmovd %xmm4, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm4, %ymm4
-; AVX512F-NEXT: movw %ax, (%rdi)
-; AVX512F-NEXT: vmovd %xmm4, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm4, %ymm4
-; AVX512F-NEXT: movw %ax, 30(%rdi)
-; AVX512F-NEXT: vmovd %xmm4, %eax
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm4, %ymm4
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm3, %ymm3
-; AVX512F-NEXT: movw %ax, 28(%rdi)
-; AVX512F-NEXT: vmovd %xmm3, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm3, %ymm3
-; AVX512F-NEXT: movw %ax, 26(%rdi)
-; AVX512F-NEXT: vmovd %xmm3, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm3, %ymm3
-; AVX512F-NEXT: movw %ax, 22(%rdi)
-; AVX512F-NEXT: vmovd %xmm3, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm3, %ymm3
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: movw %ax, 20(%rdi)
-; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: movw %ax, 18(%rdi)
-; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: movw %ax, 14(%rdi)
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movw %ax, 12(%rdi)
-; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: movw %ax, 10(%rdi)
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: movw %ax, 6(%rdi)
-; AVX512F-NEXT: vmovd %xmm3, %eax
-; AVX512F-NEXT: movw %ax, 4(%rdi)
-; AVX512F-NEXT: vmovd %xmm4, %eax
-; AVX512F-NEXT: movw %ax, 2(%rdi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: store_cvt_16f32_to_16i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; AVX512VL-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm3, %xmm4
-; AVX512VL-NEXT: vmovd %xmm4, %eax
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm4
-; AVX512VL-NEXT: movw %ax, 24(%rdi)
-; AVX512VL-NEXT: vmovd %xmm4, %eax
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm4
-; AVX512VL-NEXT: movw %ax, 16(%rdi)
-; AVX512VL-NEXT: vmovd %xmm4, %eax
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm4
-; AVX512VL-NEXT: movw %ax, 8(%rdi)
-; AVX512VL-NEXT: vmovd %xmm4, %eax
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm4, %xmm4
-; AVX512VL-NEXT: movw %ax, (%rdi)
-; AVX512VL-NEXT: vmovd %xmm4, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm4, %xmm4
-; AVX512VL-NEXT: movw %ax, 30(%rdi)
-; AVX512VL-NEXT: vmovd %xmm4, %eax
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm4, %xmm4
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX512VL-NEXT: movw %ax, 28(%rdi)
-; AVX512VL-NEXT: vmovd %xmm3, %eax
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX512VL-NEXT: movw %ax, 26(%rdi)
-; AVX512VL-NEXT: vmovd %xmm3, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX512VL-NEXT: movw %ax, 22(%rdi)
-; AVX512VL-NEXT: vmovd %xmm3, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512VL-NEXT: movw %ax, 20(%rdi)
-; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512VL-NEXT: movw %ax, 18(%rdi)
-; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: movw %ax, 14(%rdi)
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: movw %ax, 12(%rdi)
-; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: movw %ax, 10(%rdi)
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: movw %ax, 6(%rdi)
-; AVX512VL-NEXT: vmovd %xmm3, %eax
-; AVX512VL-NEXT: movw %ax, 4(%rdi)
-; AVX512VL-NEXT: vmovd %xmm4, %eax
-; AVX512VL-NEXT: movw %ax, 2(%rdi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: store_cvt_16f32_to_16i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; AVX512-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm4
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm4
+; AVX512-NEXT: movw %ax, 24(%rdi)
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm4
+; AVX512-NEXT: movw %ax, 16(%rdi)
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm4
+; AVX512-NEXT: movw %ax, 8(%rdi)
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; AVX512-NEXT: movw %ax, (%rdi)
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; AVX512-NEXT: movw %ax, 30(%rdi)
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX512-NEXT: movw %ax, 28(%rdi)
+; AVX512-NEXT: vmovd %xmm3, %eax
+; AVX512-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX512-NEXT: movw %ax, 26(%rdi)
+; AVX512-NEXT: vmovd %xmm3, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX512-NEXT: movw %ax, 22(%rdi)
+; AVX512-NEXT: vmovd %xmm3, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: movw %ax, 20(%rdi)
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: movw %ax, 18(%rdi)
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: movw %ax, 14(%rdi)
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: movw %ax, 12(%rdi)
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: movw %ax, 10(%rdi)
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: movw %ax, 6(%rdi)
+; AVX512-NEXT: vmovd %xmm3, %eax
+; AVX512-NEXT: movw %ax, 4(%rdi)
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: movw %ax, 2(%rdi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%1 = fptrunc <16 x float> %a0 to <16 x half>
%2 = bitcast <16 x half> %1 to <16 x i16>
store <16 x i16> %2, <16 x i16>* %a1
diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll
index cd4b237735f..25377f26799 100644
--- a/test/CodeGen/X86/vector-sext.ll
+++ b/test/CodeGen/X86/vector-sext.ll
@@ -3333,11 +3333,17 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) {
; AVX1-NEXT: vpinsrw $7, %ebp, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: .cfi_def_cfa_offset 48
; AVX1-NEXT: popq %r12
+; AVX1-NEXT: .cfi_def_cfa_offset 40
; AVX1-NEXT: popq %r13
+; AVX1-NEXT: .cfi_def_cfa_offset 32
; AVX1-NEXT: popq %r14
+; AVX1-NEXT: .cfi_def_cfa_offset 24
; AVX1-NEXT: popq %r15
+; AVX1-NEXT: .cfi_def_cfa_offset 16
; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: .cfi_def_cfa_offset 8
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_sext_16i1_to_16i16:
@@ -3424,11 +3430,17 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) {
; AVX2-NEXT: vpinsrw $7, %ebp, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: .cfi_def_cfa_offset 48
; AVX2-NEXT: popq %r12
+; AVX2-NEXT: .cfi_def_cfa_offset 40
; AVX2-NEXT: popq %r13
+; AVX2-NEXT: .cfi_def_cfa_offset 32
; AVX2-NEXT: popq %r14
+; AVX2-NEXT: .cfi_def_cfa_offset 24
; AVX2-NEXT: popq %r15
+; AVX2-NEXT: .cfi_def_cfa_offset 16
; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: .cfi_def_cfa_offset 8
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_sext_16i1_to_16i16:
@@ -4824,6 +4836,7 @@ define i32 @sext_2i8_to_i32(<16 x i8> %A) nounwind uwtable readnone ssp {
; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm0
; X32-SSE41-NEXT: movd %xmm0, %eax
; X32-SSE41-NEXT: popl %ecx
+; X32-SSE41-NEXT: .cfi_def_cfa_offset 4
; X32-SSE41-NEXT: retl
entry:
%Shuf = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
index dd329d21dc9..7ef5bee5420 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -3963,10 +3963,20 @@ define <16 x i16> @concat_v16i16_0_1_2_3_4_5_6_7_24_25_26_27_28_29_30_31(<16 x i
}
define <16 x i16> @concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc(<16 x i16> %a, <16 x i16> %b) {
-; ALL-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc:
-; ALL: # BB#0:
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; ALL-NEXT: retq
+; AVX1-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc:
+; AVX2: # BB#0:
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX512VL-NEXT: retq
%ahi = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%bhi = shufflevector <16 x i16> %b, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%bc0hi = bitcast <8 x i16> %ahi to <16 x i8>
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index cf1aaca4ee2..56567c7e794 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -1053,8 +1053,8 @@ define <4 x i64> @shuffle_v4i64_3254(<4 x i64> %a, <4 x i64> %b) {
;
; AVX512VL-LABEL: shuffle_v4i64_3254:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 2, i32 5, i32 4>
ret <4 x i64> %shuffle
@@ -1075,8 +1075,8 @@ define <4 x i64> @shuffle_v4i64_3276(<4 x i64> %a, <4 x i64> %b) {
;
; AVX512VL-LABEL: shuffle_v4i64_3276:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 2, i32 7, i32 6>
ret <4 x i64> %shuffle
diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll
index b95e7cf008a..e4234c05845 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -1789,21 +1789,33 @@ define <8 x i32> @shuffle_v8i32_3210fedc(<8 x i32> %a, <8 x i32> %b) {
}
define <8 x i32> @shuffle_v8i32_7654fedc(<8 x i32> %a, <8 x i32> %b) {
-; ALL-LABEL: shuffle_v8i32_7654fedc:
-; ALL: # BB#0:
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; ALL-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v8i32_7654fedc:
+; AVX1OR2: # BB#0:
+; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v8i32_7654fedc:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
ret <8 x i32> %shuffle
}
define <8 x i32> @shuffle_v8i32_fedc7654(<8 x i32> %a, <8 x i32> %b) {
-; ALL-LABEL: shuffle_v8i32_fedc7654:
-; ALL: # BB#0:
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; ALL-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v8i32_fedc7654:
+; AVX1OR2: # BB#0:
+; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v8i32_fedc7654:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 15, i32 14, i32 13, i32 12, i32 7, i32 6, i32 5, i32 4>
ret <8 x i32> %shuffle
}
@@ -2177,10 +2189,15 @@ define <8 x i32> @concat_v8i32_0123CDEF(<8 x i32> %a, <8 x i32> %b) {
}
define <8 x i32> @concat_v8i32_4567CDEF_bc(<8 x i32> %a0, <8 x i32> %a1) {
-; ALL-LABEL: concat_v8i32_4567CDEF_bc:
-; ALL: # BB#0:
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; ALL-NEXT: retq
+; AVX1OR2-LABEL: concat_v8i32_4567CDEF_bc:
+; AVX1OR2: # BB#0:
+; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-LABEL: concat_v8i32_4567CDEF_bc:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX512VL-NEXT: retq
%a0hi = shufflevector <8 x i32> %a0, <8 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%a1hi = shufflevector <8 x i32> %a0, <8 x i32> %a1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
%bc0hi = bitcast <4 x i32> %a0hi to <2 x i64>
diff --git a/test/CodeGen/X86/vector-shuffle-512-v8.ll b/test/CodeGen/X86/vector-shuffle-512-v8.ll
index 6c980559721..1d17ef109d2 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -1165,14 +1165,31 @@ define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_01014545(<8 x i64> %a, <8 x i64> %b) {
; AVX512F-LABEL: shuffle_v8i64_01014545:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_01014545:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
+; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
+; AVX512F-32-NEXT: retl
+
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_01014545_mem(<8 x i64>* %ptr, <8 x i64> %b) {
+; AVX512F-LABEL: shuffle_v8i64_01014545_mem:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = mem[0,1,0,1,4,5,4,5]
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: shuffle_v8i64_01014545_mem:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = mem[0,1,0,1,4,5,4,5]
; AVX512F-32-NEXT: retl
+ %a = load <8 x i64>, <8 x i64>* %ptr
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
ret <8 x i64> %shuffle
}
diff --git a/test/CodeGen/X86/vector-shuffle-avx512.ll b/test/CodeGen/X86/vector-shuffle-avx512.ll
index efbe5586747..b107b60cd6d 100644
--- a/test/CodeGen/X86/vector-shuffle-avx512.ll
+++ b/test/CodeGen/X86/vector-shuffle-avx512.ll
@@ -619,6 +619,7 @@ define <64 x i8> @test_mm512_mask_blend_epi8(<64 x i8> %A, <64 x i8> %W){
; KNL32-NEXT: vpblendvb %ymm3, 8(%ebp), %ymm1, %ymm1
; KNL32-NEXT: movl %ebp, %esp
; KNL32-NEXT: popl %ebp
+; KNL32-NEXT: .cfi_def_cfa %esp, 4
; KNL32-NEXT: retl
entry:
%0 = shufflevector <64 x i8> %A, <64 x i8> %W, <64 x i32> <i32 64, i32 1, i32 66, i32 3, i32 68, i32 5, i32 70, i32 7, i32 72, i32 9, i32 74, i32 11, i32 76, i32 13, i32 78, i32 15, i32 80, i32 17, i32 82, i32 19, i32 84, i32 21, i32 86, i32 23, i32 88, i32 25, i32 90, i32 27, i32 92, i32 29, i32 94, i32 31, i32 96, i32 33, i32 98, i32 35, i32 100, i32 37, i32 102, i32 39, i32 104, i32 41, i32 106, i32 43, i32 108, i32 45, i32 110, i32 47, i32 112, i32 49, i32 114, i32 51, i32 116, i32 53, i32 118, i32 55, i32 120, i32 57, i32 122, i32 59, i32 124, i32 61, i32 126, i32 63>
@@ -659,6 +660,7 @@ define <32 x i16> @test_mm512_mask_blend_epi16(<32 x i16> %A, <32 x i16> %W){
; KNL32-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm1[1],mem[2],ymm1[3],mem[4],ymm1[5],mem[6],ymm1[7],mem[8],ymm1[9],mem[10],ymm1[11],mem[12],ymm1[13],mem[14],ymm1[15]
; KNL32-NEXT: movl %ebp, %esp
; KNL32-NEXT: popl %ebp
+; KNL32-NEXT: .cfi_def_cfa %esp, 4
; KNL32-NEXT: retl
entry:
%0 = shufflevector <32 x i16> %A, <32 x i16> %W, <32 x i32> <i32 32, i32 1, i32 34, i32 3, i32 36, i32 5, i32 38, i32 7, i32 40, i32 9, i32 42, i32 11, i32 44, i32 13, i32 46, i32 15, i32 48, i32 17, i32 50, i32 19, i32 52, i32 21, i32 54, i32 23, i32 56, i32 25, i32 58, i32 27, i32 60, i32 29, i32 62, i32 31>
diff --git a/test/CodeGen/X86/vector-shuffle-v1.ll b/test/CodeGen/X86/vector-shuffle-v1.ll
index 8d057290085..0e690347a54 100644
--- a/test/CodeGen/X86/vector-shuffle-v1.ll
+++ b/test/CodeGen/X86/vector-shuffle-v1.ll
@@ -630,6 +630,7 @@ define i64 @shuf64i1_zero(i64 %a) {
; AVX512F-NEXT: orq %rcx, %rax
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: .cfi_def_cfa %rsp, 8
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -662,6 +663,7 @@ define i64 @shuf64i1_zero(i64 %a) {
; AVX512VL-NEXT: orq %rcx, %rax
; AVX512VL-NEXT: movq %rbp, %rsp
; AVX512VL-NEXT: popq %rbp
+; AVX512VL-NEXT: .cfi_def_cfa %rsp, 8
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
diff --git a/test/CodeGen/X86/vector-trunc.ll b/test/CodeGen/X86/vector-trunc.ll
index dc08d88074d..ac1083ad447 100644
--- a/test/CodeGen/X86/vector-trunc.ll
+++ b/test/CodeGen/X86/vector-trunc.ll
@@ -813,13 +813,10 @@ define void @trunc16i32_16i16_lshr(<16 x i32> %a) {
;
; AVX2-LABEL: trunc16i32_16i16_lshr:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -947,28 +944,52 @@ entry:
}
define void @trunc16i32_16i8_lshr(<16 x i32> %a) {
-; SSE-LABEL: trunc16i32_16i8_lshr:
-; SSE: # BB#0: # %entry
-; SSE-NEXT: psrld $24, %xmm1
-; SSE-NEXT: psrld $24, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: psrld $24, %xmm3
-; SSE-NEXT: psrld $24, %xmm2
-; SSE-NEXT: packuswb %xmm3, %xmm2
-; SSE-NEXT: packuswb %xmm2, %xmm0
-; SSE-NEXT: movdqu %xmm0, (%rax)
-; SSE-NEXT: retq
+; SSE2-LABEL: trunc16i32_16i8_lshr:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: psrld $24, %xmm1
+; SSE2-NEXT: psrld $24, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: psrld $24, %xmm3
+; SSE2-NEXT: psrld $24, %xmm2
+; SSE2-NEXT: packuswb %xmm3, %xmm2
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: movdqu %xmm0, (%rax)
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: trunc16i32_16i8_lshr:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: psrld $24, %xmm1
+; SSSE3-NEXT: psrld $24, %xmm0
+; SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSSE3-NEXT: psrld $24, %xmm3
+; SSSE3-NEXT: psrld $24, %xmm2
+; SSSE3-NEXT: packuswb %xmm3, %xmm2
+; SSSE3-NEXT: packuswb %xmm2, %xmm0
+; SSSE3-NEXT: movdqu %xmm0, (%rax)
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc16i32_16i8_lshr:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: psrld $24, %xmm1
+; SSE41-NEXT: psrld $24, %xmm0
+; SSE41-NEXT: packssdw %xmm1, %xmm0
+; SSE41-NEXT: psrld $24, %xmm3
+; SSE41-NEXT: psrld $24, %xmm2
+; SSE41-NEXT: packssdw %xmm3, %xmm2
+; SSE41-NEXT: packuswb %xmm2, %xmm0
+; SSE41-NEXT: movdqu %xmm0, (%rax)
+; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc16i32_16i8_lshr:
; AVX1: # BB#0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2
; AVX1-NEXT: vpsrld $24, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2
; AVX1-NEXT: vpsrld $24, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: vzeroupper
@@ -976,16 +997,12 @@ define void @trunc16i32_16i8_lshr(<16 x i32> %a) {
;
; AVX2-LABEL: trunc16i32_16i8_lshr:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpsrld $24, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $24, %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vpsrld $24, %ymm0, %ymm0
+; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
diff --git a/test/CodeGen/X86/wide-integer-cmp.ll b/test/CodeGen/X86/wide-integer-cmp.ll
index 97460b36a74..9bd53c6fbd3 100644
--- a/test/CodeGen/X86/wide-integer-cmp.ll
+++ b/test/CodeGen/X86/wide-integer-cmp.ll
@@ -105,10 +105,13 @@ define i32 @test_wide(i128 %a, i128 %b) {
; CHECK-NEXT: # BB#1: # %bb1
; CHECK-NEXT: movl $1, %eax
; CHECK-NEXT: popl %esi
+; CHECK-NEXT: .cfi_def_cfa_offset 4
; CHECK-NEXT: retl
; CHECK-NEXT: .LBB4_2: # %bb2
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: movl $2, %eax
; CHECK-NEXT: popl %esi
+; CHECK-NEXT: .cfi_def_cfa_offset 4
; CHECK-NEXT: retl
entry:
%cmp = icmp slt i128 %a, %b
diff --git a/test/CodeGen/X86/x86-framelowering-trap.ll b/test/CodeGen/X86/x86-framelowering-trap.ll
index f1590abcae8..89f4528fb06 100644
--- a/test/CodeGen/X86/x86-framelowering-trap.ll
+++ b/test/CodeGen/X86/x86-framelowering-trap.ll
@@ -6,6 +6,7 @@ target triple = "x86_64-unknown-linux-gnu"
; CHECK: pushq
; CHECK: ud2
; CHECK-NEXT: popq
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
define void @bar() {
entry:
diff --git a/test/CodeGen/X86/x86-interleaved-access.ll b/test/CodeGen/X86/x86-interleaved-access.ll
index acad9f771fc..bc6a6ea205c 100644
--- a/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/test/CodeGen/X86/x86-interleaved-access.ll
@@ -1816,6 +1816,7 @@ define void @interleaved_store_vf64_i8_stride4(<64 x i8> %a, <64 x i8> %b, <64 x
; AVX1-NEXT: vmovaps %ymm9, 64(%rdi)
; AVX1-NEXT: vmovaps %ymm8, (%rdi)
; AVX1-NEXT: addq $24, %rsp
+; AVX1-NEXT: .cfi_def_cfa_offset 8
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
diff --git a/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll b/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll
index 763d764698d..929dafbfc21 100644
--- a/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll
+++ b/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll
@@ -20,6 +20,7 @@ define x86_64_sysvcc i32 @bar(i32 %a0, i32 %a1, float %b0) #0 {
; CHECK-NEXT: movl $4, %eax
; CHECK-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; CHECK-NEXT: popq %rdx
+; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
call void asm sideeffect "", "~{rax},~{rdx},~{xmm1},~{rdi},~{rsi},~{xmm0}"()
ret i32 4